From ed162d017650c6571b0f80b56ead6fdb7ff33595 Mon Sep 17 00:00:00 2001 From: Feiyue Chen Date: Tue, 18 Oct 2022 16:55:15 +0800 Subject: [PATCH] Update internal for 22Q3 release update internal to commit-id: e2b0fde631fce349e0e3ad42b2a4d40ce7634a97 Type: Code Improvement Signed-off-by: Feiyue Chen --- src/tim/vx/internal/include/interface/ops.def | 4 + .../internal/include/kernel/vsi_nn_kernel.h | 11 + .../kernel/vsi_nn_kernel_gpu_shape_optimize.h | 6 + .../include/kernel/vsi_nn_kernel_lut.h | 4 + .../vsi_nn_op_bidirectional_sequence_rnn.h | 26 +- .../include/ops/vsi_nn_op_bucketize.h | 48 + .../internal/include/ops/vsi_nn_op_conv1d.h | 1 + .../internal/include/ops/vsi_nn_op_conv2d.h | 15 + .../internal/include/ops/vsi_nn_op_conv3d.h | 1 + .../internal/include/ops/vsi_nn_op_deconv3d.h | 1 + .../include/ops/vsi_nn_op_depthwise_conv1d.h | 1 + .../include/ops/vsi_nn_op_grouped_conv1d.h | 1 + .../include/ops/vsi_nn_op_grouped_conv2d.h | 1 + .../include/ops/vsi_nn_op_l2normalizescale.h | 2 +- .../include/ops/vsi_nn_op_layernormalize.h | 10 +- .../internal/include/ops/vsi_nn_op_lppool.h | 46 + .../vx/internal/include/ops/vsi_nn_op_pad.h | 7 - .../ops/vsi_nn_op_pre_process_yuv422.h | 81 + .../vx/internal/include/ops/vsi_nn_op_rnn.h | 9 +- .../include/ops/vsi_nn_op_scatter_elements.h | 49 + .../vsi_nn_op_unidirectional_sequence_rnn.h | 8 +- .../include/utils/vsi_nn_dtype_util_prv.h | 4 +- .../vx/internal/include/utils/vsi_nn_util.h | 10 + .../vx/internal/include/vip/virtual_device.h | 18 +- src/tim/vx/internal/include/vsi_nn_context.h | 1 + src/tim/vx/internal/include/vsi_nn_graph.h | 14 + .../vx/internal/include/vsi_nn_node_type.h | 11 +- .../include/vsi_nn_pre_post_process.h | 2 + src/tim/vx/internal/include/vsi_nn_types.h | 22 + src/tim/vx/internal/include/vsi_nn_version.h | 2 +- .../src/kernel/cl/batchnorm_single_cl.c | 2 +- .../vx/internal/src/kernel/cl/bucketize_cl.c | 303 + src/tim/vx/internal/src/kernel/cl/gather_cl.c | 10 + src/tim/vx/internal/src/kernel/cl/lppool_cl.c | 332 ++ .../vx/internal/src/kernel/cl/maximum_cl.c | 2 +- .../vx/internal/src/kernel/cl/minimum_cl.c | 2 +- .../vx/internal/src/kernel/cl/roi_align_cl.c | 28 +- .../src/kernel/cl/scatter_elements_cl.c | 351 ++ .../internal/src/kernel/cpu/bucketize_cpu.c | 229 + .../vx/internal/src/kernel/cpu/lppool_cpu.c | 264 + .../vx/internal/src/kernel/cpu/maximum_cpu.c | 16 +- .../vx/internal/src/kernel/cpu/minimum_cpu.c | 16 +- .../src/kernel/cpu/pre_process_yuv422_cpu.c | 405 ++ .../internal/src/kernel/cpu/roi_align_cpu.c | 89 +- .../src/kernel/cpu/scatter_elements_cpu.c | 258 + .../internal/src/kernel/evis/bucketize_evis.c | 323 ++ .../kernel/evis/grucell_activation_z_h_evis.c | 2 +- .../kernel/evis/instance_normalization_evis.c | 428 +- .../internal/src/kernel/evis/matrixmul_evis.c | 39 + .../src/kernel/evis/pre_process_nv12_evis.c | 148 +- .../src/kernel/evis/pre_process_yuv420_evis.c | 129 +- .../src/kernel/evis/pre_process_yuv422_evis.c | 623 ++ .../src/kernel/evis/resize_1d_nearest_evis.c | 2 +- .../src/kernel/evis/resize_bilinear_evis.c | 40 +- .../src/kernel/evis/resize_nearest_evis.c | 2 +- .../vx/internal/src/kernel/evis/select_evis.c | 64 +- .../vx/internal/src/kernel/evis/tile_evis.c | 2 + .../src/kernel/sp/layer_norm_y_direction_sp.c | 797 +++ .../src/kernel/sp/softmax_z_direction_sp.c | 938 +++ .../vx/internal/src/kernel/vsi_nn_kernel.c | 60 +- .../kernel/vsi_nn_kernel_gpu_shape_optimize.c | 59 +- .../internal/src/kernel/vsi_nn_kernel_lut.c | 34 + .../vx/internal/src/kernel/vx/convolutional.c | 23 +- 
.../internal/src/kernel/vx/eltwise_unary_vx.c | 80 + src/tim/vx/internal/src/kernel/vx/pad2_vx.c | 11 +- .../internal/src/libnnext/ops/cl/bucketize.cl | 281 + .../vx/internal/src/libnnext/ops/cl/lppool.cl | 115 + .../internal/src/libnnext/ops/cl/maximum.cl | 4 +- .../internal/src/libnnext/ops/cl/minimum.cl | 4 +- .../internal/src/libnnext/ops/cl/roi_align.cl | 141 +- .../src/libnnext/ops/cl/scatter_elements.cl | 298 + .../libnnext/ops/cl/scatter_elements_add.cl | 292 + .../libnnext/ops/cl/scatter_elements_mul.cl | 292 + .../internal/src/libnnext/ops/vx/bucketize.vx | 176 + .../libnnext/ops/vx/group_normalization_1.vx | 2 +- .../ops/vx/instance_normalization_0.vx | 141 +- .../ops/vx/instance_normalization_1.vx | 75 +- .../ops/vx/instance_normalization_2.vx | 73 +- .../ops/vx/instance_normalization_3.vx | 78 +- .../libnnext/ops/vx/l2normalizescale_axis0.vx | 4 +- .../src/libnnext/ops/vx/matrixmul_i16.vx | 141 + .../libnnext/ops/vx/pre_process_nv12_copy.vx | 86 + .../libnnext/ops/vx/pre_process_nv12_scale.vx | 318 +- .../ops/vx/pre_process_nv12_scale_8bits.vx | 197 - .../ops/vx/pre_process_nv12_scale_mix.vx | 162 - .../ops/vx/pre_process_yuv420_copy.vx | 238 + .../ops/vx/pre_process_yuv420_copy_u8.vx | 240 - .../ops/vx/pre_process_yuv420_scale_0.vx | 237 + .../ops/vx/pre_process_yuv420_scale_1.vx | 245 + .../ops/vx/pre_process_yuv420_scale_fp16.vx | 232 - .../ops/vx/pre_process_yuv420_scale_i16.vx | 227 - .../ops/vx/pre_process_yuv420_scale_i8.vx | 227 - .../ops/vx/pre_process_yuv420_scale_u8.vx | 228 - .../ops/vx/pre_process_yuv422_copy.vx | 88 + .../ops/vx/pre_process_yuv422_scale.vx | 132 + .../vx/internal/src/libnnext/ops/vx/select.vx | 95 +- .../src/libnnext/vsi_nn_libnnext_resource.c | 5019 ++++++++++------- .../internal/src/libnnext/vsi_nn_vxkernel.c | 2 +- src/tim/vx/internal/src/makefile.linux | 293 +- .../internal/src/ops/vsi_nn_op_batch2space.c | 93 +- .../vsi_nn_op_bidirectional_sequence_rnn.c | 129 +- .../vx/internal/src/ops/vsi_nn_op_bucketize.c | 208 + src/tim/vx/internal/src/ops/vsi_nn_op_clip.c | 23 +- .../vx/internal/src/ops/vsi_nn_op_conv1d.c | 2 + .../vx/internal/src/ops/vsi_nn_op_conv2d.c | 366 +- .../src/ops/vsi_nn_op_conv2d_lstm_cell.c | 2 + .../vx/internal/src/ops/vsi_nn_op_conv3d.c | 1 + .../internal/src/ops/vsi_nn_op_dataconvert.c | 5 + .../src/ops/vsi_nn_op_deconvolution1d.c | 11 +- .../src/ops/vsi_nn_op_depthwise_conv1d.c | 2 + .../vx/internal/src/ops/vsi_nn_op_eltwise.c | 19 +- .../src/ops/vsi_nn_op_expand_broadcast.c | 6 +- .../src/ops/vsi_nn_op_grouped_conv1d.c | 1 + .../src/ops/vsi_nn_op_grouped_conv2d.c | 1 + .../src/ops/vsi_nn_op_l2normalizescale.c | 17 +- .../src/ops/vsi_nn_op_layernormalize.c | 58 +- .../vx/internal/src/ops/vsi_nn_op_lppool.c | 259 + .../src/ops/vsi_nn_op_lstmunit_activation.c | 20 +- .../src/ops/vsi_nn_op_maxpoolwithargmax.c | 8 +- src/tim/vx/internal/src/ops/vsi_nn_op_pad.c | 12 +- src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c | 29 +- src/tim/vx/internal/src/ops/vsi_nn_op_pool.c | 14 +- .../internal/src/ops/vsi_nn_op_pre_process.c | 68 +- .../src/ops/vsi_nn_op_pre_process_nv12.c | 10 +- .../src/ops/vsi_nn_op_pre_process_yuv420.c | 8 +- .../src/ops/vsi_nn_op_pre_process_yuv422.c | 238 + .../vx/internal/src/ops/vsi_nn_op_reduce.c | 2 +- .../src/ops/vsi_nn_op_reduce_mean_internal.c | 2 +- src/tim/vx/internal/src/ops/vsi_nn_op_relu1.c | 21 +- src/tim/vx/internal/src/ops/vsi_nn_op_relu6.c | 22 +- src/tim/vx/internal/src/ops/vsi_nn_op_relun.c | 30 +- .../vx/internal/src/ops/vsi_nn_op_repeat.c | 10 +- .../vx/internal/src/ops/vsi_nn_op_reshape.c | 
4 +- .../vx/internal/src/ops/vsi_nn_op_resize.c | 10 + .../src/ops/vsi_nn_op_rnncell_ovxlib.c | 21 +- .../vx/internal/src/ops/vsi_nn_op_roi_pool.c | 1 + src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c | 21 +- .../src/ops/vsi_nn_op_scatter_elements.c | 171 + .../vx/internal/src/ops/vsi_nn_op_select.c | 47 +- .../vx/internal/src/ops/vsi_nn_op_softrelu.c | 22 +- .../internal/src/ops/vsi_nn_op_space2batch.c | 124 +- src/tim/vx/internal/src/ops/vsi_nn_op_sqrt.c | 22 +- .../vx/internal/src/ops/vsi_nn_op_squeeze.c | 2 +- .../vsi_nn_op_unidirectional_sequence_rnn.c | 34 +- .../src/utils/vsi_nn_code_generator.c | 4 + .../src/utils/vsi_nn_constraint_check.c | 7 +- src/tim/vx/internal/src/utils/vsi_nn_dtype.c | 26 +- src/tim/vx/internal/src/utils/vsi_nn_util.c | 42 + src/tim/vx/internal/src/vsi_nn_context.c | 13 +- src/tim/vx/internal/src/vsi_nn_graph.c | 124 + .../internal/src/vsi_nn_graph_optimization.c | 14 +- .../internal/src/vsi_nn_node_attr_template.c | 2 + src/tim/vx/internal/src/vsi_nn_tensor.c | 49 +- 153 files changed, 14300 insertions(+), 5067 deletions(-) create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_bucketize.h create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_lppool.h create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_scatter_elements.h create mode 100644 src/tim/vx/internal/src/kernel/cl/bucketize_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/lppool_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/bucketize_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/lppool_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/pre_process_yuv422_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/scatter_elements_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/evis/bucketize_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c create mode 100644 src/tim/vx/internal/src/kernel/sp/layer_norm_y_direction_sp.c create mode 100644 src/tim/vx/internal/src/kernel/sp/softmax_z_direction_sp.c create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/bucketize.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/lppool.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements_add.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements_mul.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/bucketize.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_8bits.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_mix.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_fp16.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i16.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i8.vx delete mode 
100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_u8.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_lppool.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index ae52716..045eb95 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -179,3 +179,7 @@ DEF_OP(SOFTSIGN) DEF_OP(CUMSUM) DEF_OP(MAXPOOLWITHARGMAX) DEF_OP(MOD) +DEF_OP(LPPOOL) +DEF_OP(SCATTER_ELEMENTS) +DEF_OP(PRE_PROCESS_YUV422) +DEF_OP(BUCKETIZE) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h index 7d75720..d2c4e58 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h @@ -326,9 +326,20 @@ const void * vsi_nn_kernel_param_get_const_buffer } \ static vsi_status NAME##_impl +#define DEF_SP_KERNEL_BASE_CALLBACK( NAME ) \ + static vsi_status NAME##_impl( vsi_nn_kernel_node_t node); \ + static vx_status VX_CALLBACK NAME( \ + vx_node node) {\ + return (vx_status)NAME##_impl( \ + (vsi_nn_kernel_node_t)node); \ + } \ + static vsi_status NAME##_impl + + #define DEF_KERNEL_INITIALIZER( NAME ) DEF_KERNEL_BASE_CALLBACK( NAME ) #define DEF_KERNEL_EXECUTOR( NAME ) DEF_KERNEL_BASE_CALLBACK( NAME ) #define DEF_KERNEL_DEINITIALIZER( NAME ) DEF_KERNEL_BASE_CALLBACK( NAME ) +#define DEF_SP_KERNEL_QUERY( NAME ) DEF_SP_KERNEL_BASE_CALLBACK( NAME ) void vsi_nn_kernel_backend_register ( diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h index 26a676f..cfecfd1 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h @@ -85,4 +85,10 @@ vsi_bool vsi_nn_kernel_optimize_group_norm_shape int32_t is_sp_kernel, vsi_size_t* out_shape ); +vsi_bool vsi_nn_kernel_optimize_scatter_elements_shape + ( + const vsi_size_t* shape_x, const vsi_size_t rank_x, const int32_t axis, + vsi_size_t* out_shape_x, uint32_t* out_rank_x, int32_t* out_axis, vsi_size_t max_size + ); + #endif diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h index c872cca..f413b81 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h @@ -48,6 +48,10 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum VSI_NN_KERNEL_LUT_CELU = 14, VSI_NN_KERNEL_LUT_RCP = 15, VSI_NN_KERNEL_LUT_SOFTSIGN = 16, + VSI_NN_KERNEL_LUT_LINEAR_EXP = 17, + VSI_NN_KERNEL_LUT_LINEAR_RSQRT = 18, + VSI_NN_KERNEL_LUT_LINEAR_SIGMOID = 19, + }; #define VSI_NN_KERNEL_LUT_MAX_SIZE (1024) diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h index 2bf8c77..82aa777 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h 
@@ -38,22 +38,26 @@ enum BI_RNN_FW_INPUT_WEIGHT_I = 1, BI_RNN_FW_INPUT_WEIGHT_H = 2, - BI_RNN_FW_INPUT_BIAS = 3, - BI_RNN_FW_INPUT_H_STATE = 4, + BI_RNN_FW_INPUT_BIAS_I = 3, + BI_RNN_FW_INPUT_BIAS_H = 4, + BI_RNN_FW_INPUT_H_STATE = 5, - BI_RNN_BW_INPUT_WEIGHT_I = 5, - BI_RNN_BW_INPUT_WEIGHT_H = 6, - BI_RNN_BW_INPUT_BIAS = 7, - BI_RNN_BW_INPUT_H_STATE = 8, + BI_RNN_BW_INPUT_WEIGHT_I = 6, + BI_RNN_BW_INPUT_WEIGHT_H = 7, + BI_RNN_BW_INPUT_BIAS_I = 8, + BI_RNN_BW_INPUT_BIAS_H = 9, + BI_RNN_BW_INPUT_H_STATE = 10, - BI_RNN_AUX_INPUT = 9, - BI_RNN_FW_AUX_INPUT_WEIGHT = 10, - BI_RNN_BW_AUX_INPUT_WEIGHT = 11, + BI_RNN_AUX_INPUT = 11, + BI_RNN_FW_AUX_INPUT_WEIGHT = 12, + BI_RNN_BW_AUX_INPUT_WEIGHT = 13, BI_RNN_INPUT_CNT, - BI_RNN_FW_OUTPUT_OUTPUT = 0, - BI_RNN_BW_OUTPUT_OUTPUT = 1, + BI_RNN_FW_OUTPUT_H_STATE = 0, + BI_RNN_BW_OUTPUT_H_STATE = 1, + BI_RNN_FW_OUTPUT_OUTPUT = 2, + BI_RNN_BW_OUTPUT_OUTPUT = 3, BI_RNN_OUTPUT_CNT }; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bucketize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_bucketize.h new file mode 100644 index 0000000..501b117 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_bucketize.h @@ -0,0 +1,48 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_BUCKETIZE_H +#define _VSI_NN_OP_BUCKETIZE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_bucketize_param +{ + struct _bucketize_local_data_t* local; + // Add parameters here + vsi_bool right; +} vsi_nn_bucketize_param; +_compiler_assert(offsetof(vsi_nn_bucketize_param, local) == 0, \ + vsi_nn_bucketize_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h index 5fa5041..504d984 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h @@ -54,6 +54,7 @@ typedef struct _vsi_nn_conv1d_param uint32_t group; uint32_t dilation; int32_t multiplier; + vsi_nn_pad_mode_e pad_mode; } vsi_nn_conv1d_param; _compiler_assert(offsetof(vsi_nn_conv1d_param, local) == 0, \ vsi_nn_vsi_nn_conv1d_h ); diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d.h index 282c988..55f882b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d.h @@ -30,6 +30,20 @@ extern "C" { #endif +typedef struct _vsi_nn_conv2d_param_deprecate +{ + uint32_t ksize[2]; + uint32_t stride[2]; + /* Pad left, right, top, bottom */ + uint32_t pad[4]; + /* Pad type default value shall be AUTO */ + vsi_nn_pad_e pad_type; + uint32_t weights; + uint32_t group; + uint32_t dilation[2]; + int32_t multiplier; +} vsi_nn_conv2d_param_deprecate; + typedef struct _vsi_nn_conv2d_param { uint32_t ksize[2]; @@ -42,6 +56,7 @@ typedef struct _vsi_nn_conv2d_param uint32_t group; uint32_t dilation[2]; int32_t multiplier; + vsi_nn_pad_mode_e pad_mode; } vsi_nn_conv2d_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv3d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv3d.h index bf8bf2b..590eaa4 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_conv3d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv3d.h @@ -47,6 +47,7 @@ typedef struct _vsi_nn_conv3d_param int32_t weights; int32_t multiplier; + vsi_nn_pad_mode_e pad_mode; } vsi_nn_conv3d_param; _compiler_assert(offsetof(vsi_nn_conv3d_param, local) == 0, \ vsi_nn_conv3d_h ); diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_deconv3d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_deconv3d.h index 133267f..923fa7f 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_deconv3d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_deconv3d.h @@ -43,6 +43,7 @@ typedef struct _vsi_nn_deconv3d_param uint32_t weights; uint32_t group; uint32_t output_padding[3]; + vsi_nn_pad_mode_e pad_mode; } vsi_nn_deconv3d_param; _compiler_assert(offsetof(vsi_nn_deconv3d_param, local) == 0, \ vsi_nn_deconv3d_h ); diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_depthwise_conv1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_depthwise_conv1d.h index 7f7f66f..f3d03a7 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_depthwise_conv1d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_depthwise_conv1d.h @@ -36,6 +36,7 @@ typedef struct _vsi_nn_depthwise_conv1d_param uint32_t pad[2]; uint32_t dilation; int32_t multiplier; + vsi_nn_pad_mode_e pad_mode; } vsi_nn_depthwise_conv1d_param; __END_DECLS diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h index 
fa571e9..d23c10b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h @@ -51,6 +51,7 @@ typedef struct _vsi_nn_grouped_conv1d_param uint32_t group; uint32_t dilation; int32_t multiplier; + vsi_nn_pad_mode_e pad_mode; } vsi_nn_grouped_conv1d_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h index 59858c0..f78b8ea 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h @@ -43,6 +43,7 @@ typedef struct _vsi_nn_grouped_conv2d_param uint32_t dilation[2]; int32_t multiplier; void* local; + vsi_nn_pad_mode_e pad_mode; } vsi_nn_grouped_conv2d_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_l2normalizescale.h b/src/tim/vx/internal/include/ops/vsi_nn_op_l2normalizescale.h index b15ee4e..e6fe704 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_l2normalizescale.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_l2normalizescale.h @@ -86,7 +86,7 @@ typedef struct _vsi_nn_l2normalizescale_lcl_data { vx_tensor local_tensor[_VSI_NN_L2NORMALIZESCALE_LOCAL_TENSOR_NUM]; uint32_t hash_idx; - vsi_bool execute_on_sw; + vsi_bool use_internal_node; } vsi_nn_l2normalizescale_lcl_data; typedef struct _vsi_nn_l2normalizescale_param diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_layernormalize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_layernormalize.h index 91501bb..cef6647 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_layernormalize.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_layernormalize.h @@ -35,14 +35,20 @@ extern "C" { typedef struct _vsi_nn_layernorm_lcl_data { - vx_tensor local_tensor[_VSI_NN_LAYERNORM_LOCAL_TENSOR_NUM]; + vsi_bool use_internal_node; } vsi_nn_layernorm_lcl_data; typedef struct _vsi_nn_layernormalize_param { /* local data must be the first. */ - vsi_nn_layernorm_lcl_data local; + union + { + vx_tensor local_tensor[_VSI_NN_LAYERNORM_LOCAL_TENSOR_NUM]; + vsi_nn_layernorm_lcl_data *local; + }; + float eps; + int32_t axis; } vsi_nn_layernormalize_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lppool.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lppool.h new file mode 100644 index 0000000..84a8f95 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lppool.h @@ -0,0 +1,46 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_LPPOOL_H +#define _VSI_NN_OP_LPPOOL_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_lppool_param { + vsi_nn_pad_e pad_type; + uint32_t ksize[2]; + int32_t p; + uint32_t pad[4]; + uint32_t stride[2]; +} vsi_nn_lppool_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pad.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pad.h index 7e7d5d1..91ef2c4 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pad.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pad.h @@ -30,13 +30,6 @@ extern "C" { #endif -typedef enum { - VSI_NN_PAD_MODE_CONSTANT, - VSI_NN_PAD_MODE_REPLICATE, - VSI_NN_PAD_MODE_SYMMETRIC, - VSI_NN_PAD_MODE_REFLECT, -}vsi_nn_pad_mode_e; - typedef struct _vsi_nn_pad_param { const uint32_t * front_size; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h new file mode 100644 index 0000000..b516e60 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h @@ -0,0 +1,81 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_PRE_PROCESS_YUV422_H +#define _VSI_NN_OP_PRE_PROCESS_YUV422_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_PRE_PROCESS_YUV422_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_pre_process_yuv422_lcl_data +{ + int32_t scale_x; + int32_t scale_y; + vsi_bool enable_copy; + vsi_bool enable_perm; + vx_tensor local_tensor[_VSI_NN_PRE_PROCESS_YUV422_LOCAL_TENSOR_NUM]; +} vsi_nn_pre_process_yuv422_lcl_data; + +typedef struct _vsi_nn_pre_process_yuv422_param +{ + vsi_nn_pre_process_yuv422_lcl_data* local; + + vsi_nn_yuv_type yuv422_type; + + struct + { + uint32_t left; + uint32_t top; + uint32_t width; + uint32_t height; + } rect; + + struct + { + vsi_size_t *size; + uint32_t dim_num; + } output_attr; + + uint32_t * perm; + uint32_t dim_num; + + float r_mean; + float g_mean; + float b_mean; + float rgb_scale; + + vsi_bool reverse_channel; +} vsi_nn_pre_process_yuv422_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_rnn.h b/src/tim/vx/internal/include/ops/vsi_nn_op_rnn.h index 0083c78..3e50d0a 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_rnn.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_rnn.h @@ -37,11 +37,12 @@ enum RNNCELL_INPUT_INPUT = 0, RNNCELL_INPUT_WEIGHT_I = 1, RNNCELL_INPUT_WEIGHT_H = 2, - RNNCELL_INPUT_BIAS = 3, - RNNCELL_INPUT_H_STATE = 4, + RNNCELL_INPUT_BIAS_I = 3, + RNNCELL_INPUT_BIAS_H = 4, + RNNCELL_INPUT_H_STATE = 5, - RNNCELL_INPUT_AUX_INPUT = 5, - RNNCELL_INPUT_AUX_WEIGHT = 6, + RNNCELL_INPUT_AUX_INPUT = 6, + RNNCELL_INPUT_AUX_WEIGHT = 7, RNNCELL_INPUT_CNT, RNNCELL_OUTPUT_H_STATE = 0, diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_elements.h b/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_elements.h new file mode 100644 index 0000000..c12fc85 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_elements.h @@ -0,0 +1,49 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_SCATTER_ELEMENTS_H +#define _VSI_NN_OP_SCATTER_ELEMENTS_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_scatter_elements_param +{ + struct _scatter_elements_local_data_t* local; + // Add parameters here + int32_t axis; + vsi_nn_reduction_type_e reduction; +} vsi_nn_scatter_elements_param; +_compiler_assert(offsetof(vsi_nn_scatter_elements_param, local) == 0, \ + vsi_nn_scatter_elements_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h b/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h index 985fe22..bf87649 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h @@ -37,11 +37,13 @@ enum RNN_INPUT_INPUT = 0, RNN_INPUT_WEIGHT_I = 1, RNN_INPUT_WEIGHT_H = 2, - RNN_INPUT_BIAS = 3, - RNN_INPUT_H_STATE = 4, + RNN_INPUT_BIAS_I = 3, + RNN_INPUT_BIAS_H = 4, + RNN_INPUT_H_STATE = 5, RNN_INPUT_CNT, - RNN_OUTPUT_OUTPUT = 0, + RNN_OUTPUT_H_STATE = 0, + RNN_OUTPUT_OUTPUT = 1, RNN_OUTPUT_CNT }; diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h index 4e19fc0..7eaec28 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -253,11 +253,11 @@ static VSI_INLINE_API int32_t fp32_to_dfp type_get_range( type, &max_range, &min_range ); if( fl > 0 ) { - data = (int32_t)vsi_rint( in * (float)( (int64_t)1 << fl ) ); + data = (int32_t)vsi_rint( in * (double)( (int64_t)1 << fl ) ); } else { - data = (int32_t)vsi_rint( in * ( 1.0f / (float)( (int64_t)1 << -fl ) ) ); + data = (int32_t)vsi_rint( in * ( 1.0f / (double)( (int64_t)1 << -fl ) ) ); } data = vsi_nn_min( data, (int32_t)max_range ); data = vsi_nn_max( data, (int32_t)min_range ); diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h index 77b3cb6..f939592 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -468,6 +468,16 @@ FILE* vsi_nn_fopen const char * mode ); +int32_t vsi_nn_get_vx_pad_mode + ( + vsi_nn_pad_mode_e mode + ); + +vsi_bool vsi_nn_is_3d_tensor + ( + vsi_nn_tensor_t * tensor + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vip/virtual_device.h b/src/tim/vx/internal/include/vip/virtual_device.h index a314a86..a91ef83 100644 --- a/src/tim/vx/internal/include/vip/virtual_device.h +++ b/src/tim/vx/internal/include/vip/virtual_device.h @@ -27,6 +27,8 @@ #include #include +#include "vsi_nn_pub.h" + struct _vsi_nn_graph; typedef struct _vsi_nn_graph vsi_nn_graph_t; @@ -38,13 +40,13 @@ using data_t = const void*; class IDevice { public: - IDevice(uint32_t id); - ~IDevice(); - uint32_t Id() const; - bool GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data); - bool GraphRemove(const vsi_nn_graph_t* graph); - bool ThreadExit(); - void WaitThreadIdle(); + OVXLIB_API IDevice(uint32_t id); + OVXLIB_API ~IDevice(); + OVXLIB_API uint32_t Id() const; + OVXLIB_API bool GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data); + OVXLIB_API bool GraphRemove(const vsi_nn_graph_t* graph); + OVXLIB_API bool ThreadExit(); + OVXLIB_API void WaitThreadIdle(); protected: Device* device_; @@ 
-52,4 +54,4 @@ class IDevice { } // namespace vip -#endif \ No newline at end of file +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index 95591ca..f5ace92 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -76,6 +76,7 @@ typedef struct _vsi_nn_runtime_option_t int32_t enable_opcheck; int32_t enable_concat_optimize; int32_t enable_asymi8_to_u8; + int32_t enable_dataconvert_optimize; } vsi_nn_runtime_option_t; /** diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h index dda35b7..c9c0687 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph.h +++ b/src/tim/vx/internal/include/vsi_nn_graph.h @@ -751,6 +751,20 @@ OVXLIB_API vsi_bool vsi_nn_IsGraphFastMode ( const vsi_nn_graph_t* graph ); + +OVXLIB_API vsi_status vsi_nn_CopyTensorViaGraphs + ( + vsi_nn_graph_t *src_graph, + vsi_nn_tensor_id_t src_tensor_id, + vsi_nn_graph_t *dst_graph, + vsi_nn_tensor_id_t dst_tensor_id + ); + +OVXLIB_API vsi_status vsi_nn_ExecuteGraphLoop + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t *max_iteration_tensor + ); #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index 5c170df..d41e0f0 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -196,6 +196,10 @@ #include "ops/vsi_nn_op_softsign.h" #include "ops/vsi_nn_op_cumsum.h" #include "ops/vsi_nn_op_mod.h" +#include "ops/vsi_nn_op_lppool.h" +#include "ops/vsi_nn_op_scatter_elements.h" +#include "ops/vsi_nn_op_pre_process_yuv422.h" +#include "ops/vsi_nn_op_bucketize.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" @@ -206,9 +210,10 @@ extern "C"{ /** Operation attributes */ typedef union _vsi_nn_nn_param { + vsi_nn_conv2d_param conv2d; struct { - vsi_nn_conv2d_param conv2d; + vsi_nn_conv2d_param_deprecate conv2d_deprecate; vsi_nn_pool_param pool; }; vsi_nn_fcl_param fcl; @@ -377,6 +382,10 @@ typedef union _vsi_nn_nn_param vsi_nn_softsign_param softsign; vsi_nn_cumsum_param cumsum; vsi_nn_mod_param mod; + vsi_nn_lppool_param lppool; + vsi_nn_scatter_elements_param scatter_elements; + vsi_nn_pre_process_yuv422_param pre_process_yuv422; + vsi_nn_bucketize_param bucketize; void* client_param; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h index 5cc2a3e..5da4b82 100644 --- a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h +++ b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h @@ -85,6 +85,8 @@ typedef enum VSI_NN_SOURCE_FORMAT_IMAGE_YUV444, VSI_NN_SOURCE_FORMAT_IMAGE_NV12, VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP, + VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422, + VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422, } vsi_nn_preprocess_source_format_e; /** diff --git a/src/tim/vx/internal/include/vsi_nn_types.h b/src/tim/vx/internal/include/vsi_nn_types.h index 076f493..0a655c1 100644 --- a/src/tim/vx/internal/include/vsi_nn_types.h +++ b/src/tim/vx/internal/include/vsi_nn_types.h @@ -111,6 +111,22 @@ typedef enum VSI_NN_PAD_SAME } vsi_nn_pad_e; +/** reduce type enum */ +typedef enum +{ + VSI_NN_REDUCTION_TYPE_NONE, + VSI_NN_REDUCTION_TYPE_ADD, + VSI_NN_REDUCTION_TYPE_MUL +} vsi_nn_reduction_type_e; + +/** Pad mode enum */ +typedef enum { + VSI_NN_PAD_MODE_CONSTANT, + VSI_NN_PAD_MODE_REPLICATE, + 
VSI_NN_PAD_MODE_SYMMETRIC, + VSI_NN_PAD_MODE_REFLECT, +} vsi_nn_pad_mode_e; + /** * @deprecated Platform enum * @see vsi_nn_dim_fmt_e @@ -235,6 +251,12 @@ typedef enum _vsi_nn_con2d_lstm_dataformat CONV2D_LSTM_CHANNELS_FIRST } vsi_nn_con2d_lstm_dataformat; +typedef enum _vsi_nn_yuv_type +{ + VSI_NN_YUV_TYPE_YUYV422, + VSI_NN_YUV_TYPE_UYUV422 +}vsi_nn_yuv_type; + /** Deprecated */ typedef uint32_t vsi_nn_size_t; diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 711c498..5079bfe 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 50 +#define VSI_NN_VERSION_PATCH 57 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c index 31a5223..c62f0b4 100644 --- a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c @@ -56,7 +56,7 @@ __BEGIN_DECLS VSI_NN_GEN_BATCH_NORM_KERNEL_SOURCE_NAME }, #define HASH_BATCH_NORM_SH_KERNEL_2D_NAME( SRC_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("batch_norm_"#SRC_TYPE"to"#DST_TYPE"_2D") + CVIVANTE_NAMESPACE("cl.batch_norm_"#SRC_TYPE"to"#DST_TYPE"_2D") #define TENSOR_BATCH_NORM_KERNELS_2D( SRC_TYPE, OUT_TYPE) \ { HASH_BATCH_NORM_KEY( SRC_TYPE, OUT_TYPE, 1), \ diff --git a/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c b/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c new file mode 100644 index 0000000..e20cb1b --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c @@ -0,0 +1,303 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum +{ + INTERNAL_KERNEL_BUCKETIZE, +} _internal_kernel_e; + +#define STR(a) #a + +// Add kernel hashtable here +#define BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \ + (( IN0_DTYPE ) | ( IN1_DTYPE << 8 ) | ( OUT_DTYPE << 16 ) | (RIGHT << 24) | (IMG_2D << 25)) + +#define PACK_KERNEL_3D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \ + { BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ), \ + CVIVANTE_NAMESPACE("cl.bucketize_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ + "bucketize" } +#define PACK_KERNEL_2D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \ + { BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ), \ + CVIVANTE_NAMESPACE("cl.bucketize_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + "bucketize" } +#define PACK_KERNEL_RIGHT_3D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \ + { BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ), \ + CVIVANTE_NAMESPACE("cl.bucketize_right_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ + "bucketize" } +#define PACK_KERNEL_RIGHT_2D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \ + { BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ), \ + CVIVANTE_NAMESPACE("cl.bucketize_right_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + "bucketize" } + +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + PACK_KERNEL_3D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 0 ), \ + PACK_KERNEL_2D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 1 ), \ + PACK_KERNEL_RIGHT_3D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 0 ), \ + PACK_KERNEL_RIGHT_2D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 1 ), + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _bucketize_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32, I32 ) + PACK_KERNEL_MAP( I32, I32, I32 ) + PACK_KERNEL_MAP( U32, U32, I32 ) + PACK_KERNEL_MAP( BF16, BF16, I32 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _bucketize_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _BUCKETIZE_PARAM_NUM _cnt_of_array( _bucketize_kernel_param_def ) +#define SCALAR_BOUNDARIES_VALUE (3) +#define SCALAR_SCALE0_VALUE (4) +#define SCALAR_TAIL0_VALUE (5) +#define SCALAR_SCALE1_VALUE (6) +#define SCALAR_TAIL1_VALUE (7) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_bucketize_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_size_array_t * out_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + 
gpu_param.global_scale[2] = 1; + + gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); +#undef SAFE_FREE_TENSOR_ATTR + return status; +} /* _bucketize_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t right, + vsi_bool is_img2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _bucketize_kernel_map; + size_t kernel_map_size = _cnt_of_array( _bucketize_kernel_map ); + vx_param_description_t * param_def = _bucketize_kernel_param_def; + vx_kernel_initialize_f initializer = _bucketize_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + +#define _PACK_SELECT_KEY( in0_dtype, in1_dtype ) \ + ( ( in0_dtype ) | ( in1_dtype << 8 )) + + switch (_PACK_SELECT_KEY(in0_dtype, in1_dtype)) + { + case _PACK_SELECT_KEY(F32, F32): + case _PACK_SELECT_KEY(F16, F16): + key = BUCKETIZE_HASH_KEY( F32, F32, out_dtype, right, is_img2d ); + break; + case _PACK_SELECT_KEY(I8, I8): + case _PACK_SELECT_KEY(I16, I16): + case _PACK_SELECT_KEY(I32, I32): + key = BUCKETIZE_HASH_KEY( I32, I32, out_dtype, right, is_img2d ); + break; + case _PACK_SELECT_KEY(U8, U8): + case _PACK_SELECT_KEY(U16, U16): + case _PACK_SELECT_KEY(U32, U32): + key = BUCKETIZE_HASH_KEY( U32, U32, out_dtype, right, is_img2d ); + break; + default: + key = BUCKETIZE_HASH_KEY( in0_dtype, in1_dtype, out_dtype, right, is_img2d ); + break; + } +#undef _PACK_SELECT_KEY + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _bucketize_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_BUCKETIZE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + float input0_scale= vsi_nn_get_tensor_scale(inputs[0]); 
+ float input0_tail = -input0_scale * (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input1_scale= vsi_nn_get_tensor_scale(inputs[1]); + float input1_tail = -input0_scale * (float)vsi_nn_get_tensor_zero_point(inputs[1]); + int32_t boundaries_size = (int32_t)inputs[1]->attr.size[0]; + vsi_bool image_2d = FALSE; + int32_t right = vsi_nn_kernel_param_get_int32( params, "right" ); + + if( !vsi_nn_kernel_gpu_check_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num ) || + boundaries_size >= GPU_TENSOR_MAX_WIDTH ) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + + status = _query_kernel( kernel, inputs, outputs, right, image_2d ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _BUCKETIZE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + node_params[SCALAR_BOUNDARIES_VALUE] = vsi_nn_kernel_scalar_create( graph, I32, &boundaries_size ); + node_params[SCALAR_SCALE0_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale ); + node_params[SCALAR_TAIL0_VALUE] = vsi_nn_kernel_scalar_create(graph, F32, &input0_tail ); + node_params[SCALAR_SCALE1_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &input1_scale ); + node_params[SCALAR_TAIL1_VALUE] = vsi_nn_kernel_scalar_create(graph, F32, &input1_tail ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _BUCKETIZE_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_BOUNDARIES_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE0_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL0_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE1_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL1_VALUE] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( bucketize, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c index f04c62f..66eb842 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c @@ -252,6 +252,16 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (input0_dtype == I8) + { + input0_dtype = I32; + } + + if (output_dtype == I8) + { + output_dtype = I32; + } + key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0, is_batch ); for ( i = 0; i < _cnt_of_array(gather_map); i ++ ) diff --git a/src/tim/vx/internal/src/kernel/cl/lppool_cl.c b/src/tim/vx/internal/src/kernel/cl/lppool_cl.c new file mode 100644 index 0000000..514bec0 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/lppool_cl.c @@ -0,0 +1,332 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be 
included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_LPPOOL, +} _internal_kernel_e; + +#define _LPPOOL_KERNEL_SOURCE_NAME "lppool" + +// Add kernel hashtable here +#define LPPOOL_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) +#define LPPOOL_KERNELS( IN_DTYPE, OUT_DTYPE ) \ + { LPPOOL_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.lppool_"#IN_DTYPE"to"#OUT_DTYPE), \ + _LPPOOL_KERNEL_SOURCE_NAME }, \ + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _lppool_kernel_map[] = +{ + // Register kernel here + LPPOOL_KERNELS( F32, F32 ) + LPPOOL_KERNELS( F32, U32 ) + LPPOOL_KERNELS( F32, I32 ) + LPPOOL_KERNELS( U32, U32 ) + LPPOOL_KERNELS( U32, F32 ) + LPPOOL_KERNELS( I32, I32 ) + LPPOOL_KERNELS( I32, F32 ) + LPPOOL_KERNELS( BF16, BF16 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _lppool_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _LPPOOL_PARAM_NUM _cnt_of_array( _lppool_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_lppool_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vx_tensor output = (vx_tensor)param[1]; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_size_array_t *output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + + 
gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]; + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = (output_shape->data[2] + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2]; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + + return status; +} /* _lppool_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _lppool_kernel_map; + size_t kernel_map_size = _cnt_of_array( _lppool_kernel_map ); + vx_param_description_t * param_def = _lppool_kernel_param_def; + vx_kernel_initialize_f initializer = _lppool_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + +#define _PACK_SELECT_KEY( in_dtype, out_dtype ) \ + (( in_dtype ) | (out_dtype << 8 )) + switch (_PACK_SELECT_KEY(in_dtype, out_dtype)) + { + case _PACK_SELECT_KEY(F32, F32): + case _PACK_SELECT_KEY(F16, F16): + case _PACK_SELECT_KEY(F32, F16): + case _PACK_SELECT_KEY(F16, F32): + key = LPPOOL_HASH_KEY( F32, F32); + break; + case _PACK_SELECT_KEY(F32, U8): + case _PACK_SELECT_KEY(F16, U8): + key = LPPOOL_HASH_KEY( F32, U32); + break; + case _PACK_SELECT_KEY(F32, I8): + case _PACK_SELECT_KEY(F32, I16): + case _PACK_SELECT_KEY(F16, I8): + case _PACK_SELECT_KEY(F16, I16): + key = LPPOOL_HASH_KEY( F32, I32); + break; + case _PACK_SELECT_KEY(U8, U8): + key = LPPOOL_HASH_KEY( U32, U32); + break; + case _PACK_SELECT_KEY(U8, F16): + case _PACK_SELECT_KEY(U8, F32): + key = LPPOOL_HASH_KEY( U32, F32); + break; + case _PACK_SELECT_KEY(I8, I8): + case _PACK_SELECT_KEY(I8, I16): + case _PACK_SELECT_KEY(I16, I8): + case _PACK_SELECT_KEY(I16, I16): + key = LPPOOL_HASH_KEY( I32, I32); + break; + case _PACK_SELECT_KEY(I8, F16): + case _PACK_SELECT_KEY(I8, F32): + case _PACK_SELECT_KEY(I16, F16): + case _PACK_SELECT_KEY(I16, F32): + key = LPPOOL_HASH_KEY( I32, F32); + break; + default: + key = LPPOOL_HASH_KEY( in_dtype, out_dtype); + break; + } +#undef _PACK_SELECT_KEY + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _lppool_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + 
const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_LPPOOL_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x"); + int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y"); + int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x"); + int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y"); + int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left"); + int32_t pad_top = vsi_nn_kernel_param_get_int32(params, "pad_top"); + int32_t p = vsi_nn_kernel_param_get_int32(params, "p"); + int32_t width = (int32_t)inputs[0]->attr.size[0]; + int32_t height = (int32_t)inputs[0]->attr.size[1]; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + + if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num )) + { + return NULL; + } + + outputScale = 1.0f / outputScale; + inputTail = -(inputTail * inputScale); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + uint32_t index = 2; + vsi_nn_kernel_node_pack_io( node_params, _LPPOOL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &p ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputTail ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _LPPOOL_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + vsi_nn_kernel_scalar_release( &node_params[13] ); + vsi_nn_kernel_scalar_release( &node_params[14] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( lppool, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c index c692265..c81289e 100644 --- a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c @@ -250,7 +250,7 @@ static vsi_nn_kernel_node_t _setup float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; float outputScale = vsi_nn_get_tensor_scale(outputs[0]); - float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; diff --git a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c index e5fe695..92a19a3 100644 --- a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c @@ -250,7 +250,7 @@ static vsi_nn_kernel_node_t _setup float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; float outputScale = vsi_nn_get_tensor_scale(outputs[0]); - float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); outputScale = vsi_abs(outputScale) < 1e-5 ? 
0.0f : 1.0f / outputScale; diff --git a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c index c36851e..d82816c 100644 --- a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c @@ -87,6 +87,7 @@ static vx_param_description_t _roi_align_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) @@ -103,8 +104,9 @@ static vx_param_description_t _roi_align_kernel_param_def[] = #define SCALAR_SAMPLING_X_RATIO (14) #define SCALAR_SAMPLING_Y_RATIO (15) #define SCALAR_DEPTH (16) +#define SCALAR_FORMAT (17) -#define ROI_ALIGN_PARAM_NUM 17 +#define ROI_ALIGN_PARAM_NUM 18 #define ROI_ALIGN_QUANT_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) /* @@ -143,12 +145,8 @@ DEF_KERNEL_INITIALIZER(_roi_align_initializer) gpu_param.global_scale[2] = 1; gpu_param.dim = 3; - gpu_param.global_size[0] = gpu_align_p2( - (out_shape->data[0] + gpu_param.global_scale[0] - 1) - / gpu_param.global_scale[0], 4); - gpu_param.global_size[1] = ( - (out_shape->data[1] + gpu_param.global_scale[1] - 1) - / gpu_param.global_scale[1]); + gpu_param.global_size[0] = out_shape->data[0]; + gpu_param.global_size[1] = out_shape->data[1]; gpu_param.global_size[2] = rois_shape->data[1]; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); @@ -213,7 +211,8 @@ static vsi_status _query_kernel kernel->info.numParams = (uint32_t)param_def_size; kernel->info.initialize = initializer; // Register code source - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", kernel_map[i].source_name ); // Register binary source vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, @@ -259,8 +258,8 @@ static vsi_nn_kernel_node_t _setup float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); float width_scale = roi_scale / width_ratio; float height_scale = roi_scale / height_ratio; - float in_width = (float)(inputs[0]->attr.size[0]); - float in_height = (float)(inputs[0]->attr.size[1]); + int32_t in_width = (int32_t)(inputs[0]->attr.size[0]); + int32_t in_height = (int32_t)(inputs[0]->attr.size[1]); float rcp_of_out_width = 1.0f / (float)(outputs[0]->attr.size[0]); float rcp_of_out_height = 1.0f / (float)(outputs[0]->attr.size[1]); float sampling_x_ratio = width_sample_num > 0 ? (float)width_sample_num : 0; @@ -294,6 +293,8 @@ static vsi_nn_kernel_node_t _setup if ( VSI_SUCCESS == status ) { + int32_t out_dtype = (int32_t)vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + int32_t dtype = out_dtype == F16 ? 1 : out_dtype == F32 ? 
2 : 0;
         size_t node_params_num = ROI_ALIGN_PARAM_NUM;
         node = vsi_nn_kernel_create_node( graph, kernel );
@@ -309,13 +310,14 @@ static vsi_nn_kernel_node_t _setup
             node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
             node_params[SCALAR_SPATIAL_X_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &width_scale );
             node_params[SCALAR_SPATIAL_Y_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &height_scale );
-            node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &in_width );
-            node_params[SCALAR_INPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &in_height );
+            node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, I32, &in_width );
+            node_params[SCALAR_INPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, I32, &in_height );
             node_params[SCALAR_RCP_OF_OUTPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &rcp_of_out_width );
             node_params[SCALAR_RCP_OF_OUTPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &rcp_of_out_height );
             node_params[SCALAR_SAMPLING_X_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &sampling_x_ratio );
             node_params[SCALAR_SAMPLING_Y_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &sampling_y_ratio );
             node_params[SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( graph, I32, &depth );
+            node_params[SCALAR_FORMAT] = vsi_nn_kernel_scalar_create( graph, I32, &dtype );
             /* Pass parameters to node. */
             status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
@@ -332,6 +334,7 @@
             vsi_nn_kernel_scalar_release( &node_params[SCALAR_SAMPLING_X_RATIO] );
             vsi_nn_kernel_scalar_release( &node_params[SCALAR_SAMPLING_Y_RATIO] );
             vsi_nn_kernel_scalar_release( &node_params[SCALAR_DEPTH] );
+            vsi_nn_kernel_scalar_release( &node_params[SCALAR_FORMAT] );
         }
     }
diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c
new file mode 100644
index 0000000..2be6a78
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c
@@ -0,0 +1,351 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_SCATTER_ELEMENTS, +} _internal_kernel_e; + +#define _KERNEL_SOURCE0 "scatter_elements" +#define _KERNEL_SOURCE1 "scatter_elements_add" +#define _KERNEL_SOURCE2 "scatter_elements_mul" + +#define STR(a) #a +// Add kernel hashtable here +#define SCATTER_ELEMENTS_HASH_KEY( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS, REDUCTION ) \ + (( IN0_DTYPE ) | ( IN2_DTYPE << 8 ) | ( OUT_DTYPE << 16 ) | ( AXIS << 24 ) | ( REDUCTION << 28 )) + +#define PACK_KERNEL_NONE_MAP( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS ) \ + { SCATTER_ELEMENTS_HASH_KEY( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS, VSI_NN_REDUCTION_TYPE_NONE ), \ + CVIVANTE_NAMESPACE("cl.scatter_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE) \ + "_I32_"STR(IN2_DTYPE)"to"STR(OUT_DTYPE)), \ + _KERNEL_SOURCE0 } + +#define PACK_KERNEL_ADD_MAP( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS ) \ + { SCATTER_ELEMENTS_HASH_KEY( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS, VSI_NN_REDUCTION_TYPE_ADD ), \ + CVIVANTE_NAMESPACE("cl.scatter_elements_add_axis"STR(AXIS)"_"STR(IN0_DTYPE) \ + "_I32_"STR(IN2_DTYPE)"to"STR(OUT_DTYPE)), \ + _KERNEL_SOURCE1 } + +#define PACK_KERNEL_MUL_MAP( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS ) \ + { SCATTER_ELEMENTS_HASH_KEY( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS, VSI_NN_REDUCTION_TYPE_MUL ), \ + CVIVANTE_NAMESPACE("cl.scatter_elements_mul_axis"STR(AXIS)"_"STR(IN0_DTYPE) \ + "_I32_"STR(IN2_DTYPE)"to"STR(OUT_DTYPE)), \ + _KERNEL_SOURCE2 } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + + +#define PACK_KERNELS_MAP(type) \ + PACK_KERNEL_NONE_MAP( type, type, type, 0 ), \ + PACK_KERNEL_NONE_MAP( type, type, type, 1 ), \ + PACK_KERNEL_ADD_MAP( type, type, type, 0 ), \ + PACK_KERNEL_ADD_MAP( type, type, type, 1 ), \ + PACK_KERNEL_MUL_MAP( type, type, type, 0 ), \ + PACK_KERNEL_MUL_MAP( type, type, type, 1 ), \ + PACK_KERNEL_MUL_MAP( type, type, type, 2 ) + +static const _kernel_map_type _scatter_elements_kernel_map[] = +{ + // Register kernel here + PACK_KERNELS_MAP( I8 ), + PACK_KERNELS_MAP( U8 ), + PACK_KERNELS_MAP( I16 ), + PACK_KERNELS_MAP( F16 ), + PACK_KERNELS_MAP( I32 ), + PACK_KERNELS_MAP( F32 ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _scatter_elements_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, 
VX_PARAMETER_STATE_REQUIRED}, +}; +#define _SCATTER_ELEMENTS_PARAM_NUM _cnt_of_array( _scatter_elements_kernel_param_def ) +#define SCALAR_INPUT_AXIS (4) +#define SCALAR_INPUT_REDUCTION (5) +#define SCALAR_REF_SCALE (6) +#define SCALAR_REF_TAIL (7) +#define SCALAR_UPDATE_SCALE (8) +#define SCALAR_UPDATE_TAIL (9) +#define SCALAR_OUTPUT_ZP (10) +#define SCALAR_INDICES_INNER (11) +#define SCALAR_INDICES_AXIS (12) +#define SCALAR_INDICES_OUTER (13) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_scatter_elements_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_size_array_t * out_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3; + gpu_param.global_size[0] = out_shape->data[0]; + gpu_param.global_size[1] = out_shape->data[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _scatter_elements_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + int32_t reduction + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e in2_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _scatter_elements_kernel_map; + size_t kernel_map_size = _cnt_of_array( _scatter_elements_kernel_map ); + vx_param_description_t * param_def = _scatter_elements_kernel_param_def; + vx_kernel_initialize_f initializer = _scatter_elements_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (in1_dtype != I32) + { + return VSI_FAILURE; + } + + key = SCATTER_ELEMENTS_HASH_KEY( in0_dtype, in2_dtype, out_dtype, axis, reduction ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _scatter_elements_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + 
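+    /* If no entry of _scatter_elements_kernel_map matched the packed
+     * (in0_dtype, in2_dtype, out_dtype, axis, reduction) key (for example an
+     * unsupported dtype combination, or an axis the CL sources do not cover),
+     * status is still VSI_FAILURE here and _setup() will not create a node.
+     */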
return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SCATTER_ELEMENTS_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[4] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank_in = 0; + int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis"); + int32_t reduction = vsi_nn_kernel_param_get_int32(params, "reduction"); + int32_t new_axis0 = 0; + int32_t new_axis1 = 0; + int32_t inner_size = 0; + int32_t axis_size = 0; + int32_t outer_size = 0; + vsi_bool ret = FALSE; + float output_scale = vsi_nn_get_tensor_scale(outputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float input0_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input2_scale = vsi_nn_get_tensor_scale(inputs[2]); + float input2_tail = (float)vsi_nn_get_tensor_zero_point(inputs[2]); + +#define MAX_SHAPE_SIZE (0xFFFFFFFF) + ret = vsi_nn_kernel_optimize_scatter_elements_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rank_in, &new_axis0, MAX_SHAPE_SIZE); + ret &= vsi_nn_kernel_optimize_scatter_elements_shape( + inputs[1]->attr.size, inputs[1]->attr.dim_num, axis, + shapes[1], &rank_in, &new_axis1, MAX_SHAPE_SIZE); +#undef MAX_SHAPE_SIZE + + + if ( ret && new_axis0 == new_axis1 ) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], rank_in ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + inputs[2], shapes[1], rank_in ); + reshape_tensors[3] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[0], rank_in ); + + inner_size = new_axis0 == 0 ? 1 : (int32_t)shapes[1][0]; + axis_size = new_axis0 == 0 ? (int32_t)shapes[1][0] : (int32_t)shapes[1][1]; + outer_size = new_axis0 == 0 ? (int32_t)shapes[1][1] : rank_in > 2 ? (int32_t)shapes[1][2] : 1; + } + else + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, axis, reduction ); + if ( VSI_SUCCESS == status) + { + input0_scale = input0_scale / output_scale; + input0_tail = - input0_tail * input0_scale; + input2_scale = input2_scale / output_scale; + input2_tail = - input2_tail * input2_scale; + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SCATTER_ELEMENTS_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[3], output_num ); + /* Pass parameters to node. 
*/ + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(graph, I32, &new_axis0 ); + node_params[SCALAR_INPUT_REDUCTION] = vsi_nn_kernel_scalar_create(graph, I32, &reduction ); + node_params[SCALAR_REF_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale ); + node_params[SCALAR_REF_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input0_tail ); + node_params[SCALAR_UPDATE_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input2_scale ); + node_params[SCALAR_UPDATE_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input2_tail ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(graph, F32, &output_zp ); + node_params[SCALAR_INDICES_INNER] = vsi_nn_kernel_scalar_create(graph, I32, &inner_size ); + node_params[SCALAR_INDICES_AXIS] = vsi_nn_kernel_scalar_create(graph, I32, &axis_size ); + node_params[SCALAR_INDICES_OUTER] = vsi_nn_kernel_scalar_create(graph, I32, &outer_size ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _SCATTER_ELEMENTS_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_REDUCTION] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_REF_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_REF_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_UPDATE_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_UPDATE_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + } + } + + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); + vsi_safe_release_tensor( reshape_tensors[2] ); + vsi_safe_release_tensor( reshape_tensors[3] ); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( scatter_elements, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/bucketize_cpu.c b/src/tim/vx/internal/src/kernel/cpu/bucketize_cpu.c new file mode 100644 index 0000000..b5bfbcb --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/bucketize_cpu.c @@ -0,0 +1,229 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.bucketize") + + +/* + * Kernel params + */ +static vx_param_description_t _bucketize_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _BUCKETIZE_PARAM_NUM _cnt_of_array( _bucketize_kernel_param_def ) +#define SCALAR_RIGHT_VALUE (3) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i = 0, j = 0; + int32_t right = 0; + uint32_t boundaries_size = 0; + + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_RIGHT_VALUE], &(right)); + + boundaries_size = (uint32_t)in_attr[1]->shape->data[0]; + + for (i = 0; i < out_elements[0]; i++) + { + float src0 = f32_in_buffer[0][i]; + float dst = 0; + + for (j = 0; j < boundaries_size; j++) + { + float src1 = f32_in_buffer[1][j]; + + if (right == 1) + { + dst += (src0 >= src1 ? 1.0f : 0.0f); + } + else + { + dst += (src0 > src1 ? 
1.0f : 0.0f); + } + } + + f32_out_buffer[0][i] = dst; + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _bucketize_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _bucketize_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_BUCKETIZE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t right = vsi_nn_kernel_param_get_int32( params, "right" ); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _BUCKETIZE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + node_params[SCALAR_RIGHT_VALUE] = vsi_nn_kernel_scalar_create( graph, I32, &right ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _BUCKETIZE_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_RIGHT_VALUE] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( bucketize, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/lppool_cpu.c b/src/tim/vx/internal/src/kernel/cpu/lppool_cpu.c new file mode 100644 index 0000000..0f66636 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/lppool_cpu.c @@ -0,0 +1,264 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.lppool") + + +/* + * Kernel params + */ +static vx_param_description_t _lppool_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _LPPOOL_PARAM_NUM _cnt_of_array( _lppool_kernel_param_def ) + + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_lppool_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float * buffer[_INPUT_NUM + _OUTPUT_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_INPUT_NUM + _OUTPUT_NUM] = { NULL }; + int32_t ksize_x = 0, ksize_y = 0, stride_x = 0, stride_y = 0; + int32_t pad_left = 0, pad_right = 0, pad_top = 0, pad_bottom = 0; + int32_t p = 0; + int32_t i = 0; + input[0] = (vsi_nn_kernel_tensor_t)param[0]; + output[0] = (vsi_nn_kernel_tensor_t)param[1]; + attr[0] = vsi_nn_kernel_tensor_attr_create( input[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( output[0] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &ksize_x); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &ksize_y); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &pad_left); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &pad_right); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &pad_top); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &pad_bottom); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], 
&stride_x); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &stride_y); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &p); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( input[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + { + int32_t batch = (int32_t)attr[1]->shape->data[2]; + int32_t height_o = (int32_t)attr[1]->shape->data[1]; + int32_t width_o = (int32_t)attr[1]->shape->data[0]; + int32_t height = (int32_t)attr[0]->shape->data[1]; + int32_t width = (int32_t)attr[0]->shape->data[0]; + int32_t b = 0, j = 0; + int32_t output_base = 0; + int32_t input_base = 0; + float data = 0; + for (b = 0; b < batch; b++) + { + output_base = b * height_o * width_o; + input_base = b * height * width; + for (j = 0; j < height_o; j++) + { + for (i = 0; i < width_o; i++) + { + int32_t hstart = j * stride_y - pad_top; + int32_t wstart = i * stride_x - pad_left; + int32_t hend = vsi_nn_min(hstart + ksize_y, height); + int32_t wend = vsi_nn_min(wstart + ksize_x, width); + int32_t pool_index = output_base + j * width_o + i; + int32_t h = 0, w = 0; + float sum_of_pow = 0; + float out_data = 0; + hstart = vsi_nn_max(hstart, 0); + wstart = vsi_nn_max(wstart, 0); + + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + int32_t index = input_base + h * width + w; + data = buffer[0][index]; + sum_of_pow += (float)pow(fabs(data),p); + } + } + out_data = (float)pow(sum_of_pow, 1.0f / p); + buffer[1][pool_index] = out_data; + } + } + } + + } + status = vsi_nn_kernel_tensor_write_from_float( output[0], attr[1], + buffer[1], out_elements ); +final: + for ( i = 0; i < _INPUT_NUM + _OUTPUT_NUM; i ++ ) + { + vsi_nn_safe_free( buffer[i] ); + if (attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + } + + return status; +} /* _lppool_exec() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _lppool_exec; + kernel->info.parameters = _lppool_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _lppool_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_LPPOOL_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + + int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x"); + int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y"); + int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x"); + int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y"); + int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left"); + int32_t pad_right = vsi_nn_kernel_param_get_int32(params, "pad_right"); + int32_t pad_top = vsi_nn_kernel_param_get_int32(params, 
"pad_top"); + int32_t pad_bottom = vsi_nn_kernel_param_get_int32(params, "pad_bottom"); + int32_t p = vsi_nn_kernel_param_get_int32(params, "p"); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + int32_t index = 2; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _LPPOOL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_right ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_bottom ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &p ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _LPPOOL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( lppool, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c index e109349..183fedc 100644 --- a/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c @@ -56,14 +56,26 @@ static vsi_ssize_t _expand_offset vsi_size_t i; vsi_ssize_t offset = 0; - for( i = 0; i < rank && index; i ++ ) + for ( i = 0; i < rank && index; i ++ ) { - if( shape[i] == out_shape[i] ) + if (strides[0] == 0) + { + if (i == 0) + { + offset += (index % out_shape[0]); + } + else + { + offset += (vsi_ssize_t)strides[i] * 2 * ( index % out_shape[i] ); + } + } + else if ( shape[i] == out_shape[i] ) { offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); } index /= out_shape[i]; } + return offset; } diff --git a/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c index 61d94c6..7cb6630 100644 --- a/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c @@ -52,14 +52,26 @@ static vsi_ssize_t _expand_offset vsi_size_t i; vsi_ssize_t offset = 0; - for( i = 0; i < rank && index; i ++ ) + for ( i = 0; i < rank && index; i ++ ) { - if( shape[i] == out_shape[i] ) + if (strides[0] == 0) + { + if (i == 0) + { + offset += (index % out_shape[0]); + } + else + { + offset += (vsi_ssize_t)strides[i] * 2 * ( index % out_shape[i] ); + } + } + else if ( shape[i] == out_shape[i] ) { offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); } index /= out_shape[i]; } + return offset; } diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv422_cpu.c 
b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv422_cpu.c new file mode 100644 index 0000000..189ef8f --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv422_cpu.c @@ -0,0 +1,405 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _CPU_ARG_NUM (11) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_yuv422_sw") + +#define DESCALE(x) (((x) + (1<<19)) >> 20) + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + + +DEF_KERNEL_EXECUTOR(_pre_process_yuv422_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + float * outBuffer = NULL; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0; + float rMean = 0, gMean = 0, bMean = 0, var = 0; + int32_t order = 0, trans = 0, yuv422_type = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + i = 2; + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &rMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &gMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &bMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &var); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &order); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &trans); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = 
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yuv422_type); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + if(trans) + { + outBuffer = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( outBuffer, "Create output buffer fail.", final ); + memset( outBuffer, 0, out_elements * sizeof(float) ); + } + + { + int32_t dx, dy, dz; + int32_t src_width = (int32_t)attr[0]->shape->data[0]; + int32_t dst_width = (int32_t)(trans ? attr[1]->shape->data[1] : attr[1]->shape->data[0]); + int32_t dst_height = (int32_t)(trans ? attr[1]->shape->data[1] : attr[1]->shape->data[1]); + int32_t stride = (int32_t)(dst_width * dst_height); + int32_t rOffset = 0; + int32_t gOffset = 1 * stride; + int32_t bOffset = 2 * stride; + float D0, D1, E0, E1; + float R0, G0, B0, R1, G1, B1; + float min = 0; + float max = 255; + float* src_y_slice = NULL; + + uint32_t roi_width = (xRatio * dst_width) >> 15; + uint32_t roi_height = (yRatio * dst_height) >> 15; + uint32_t xrIntFloat_16 = (roi_width << 16) / dst_width + 1; + uint32_t yrIntFloat_16 = (roi_height << 16) / dst_height + 1; + uint32_t srcy = 0, srcx = 0; + + if(attr[1]->dtype == I8) + { + min = -128; + max = 127; + } + else if(attr[1]->dtype == I16 || attr[1]->dtype == F16) + { + min = -32768; + max = 32767; + } + + if(order) + { + rOffset = 2 * stride; + bOffset = 0; + } + + for ( dz = 0; dz < 1; dz ++) + { + for ( dy = 0; dy < (int32_t)dst_height; dy++) + { + srcy = (((uint32_t)dy * yrIntFloat_16) >> 16) + yOffset; + src_y_slice = buffer[0] + (srcy) * src_width; + for ( dx = 0; dx < (int32_t)dst_width; dx += 2) + { + int32_t output_index = 0; + int32_t dstR_idx = 0, dstG_idx = 0, dstB_idx = 0; + float tmpY0 = 0.0f; + float tmpY1 = 0.0f; + float tmpU0 = 0.0f; + float tmpU1 = 0.0f; + float tmpV0 = 0.0f; + float tmpV1 = 0.0f; + + srcx = ((((uint32_t)dx * xrIntFloat_16) >> 16) + xOffset) * 2; + + if (xrIntFloat_16 >> 16 == 1) + { + if (yuv422_type == 1) + { + tmpY0 = src_y_slice[srcx + 1]; + tmpU0 = src_y_slice[srcx]; + tmpY1 = src_y_slice[srcx + 3]; + tmpV0 = src_y_slice[srcx + 2]; + tmpU1 = tmpU0; + tmpV1 = tmpV0; + } + else + { + tmpY0 = src_y_slice[srcx]; + tmpU0 = src_y_slice[srcx + 1]; + tmpY1 = src_y_slice[srcx + 2]; + tmpV0 = src_y_slice[srcx + 3]; + tmpU1 = tmpU0; + tmpV1 = tmpV0; + } + } + else + { + if (yuv422_type == 1) + { + tmpY0 = src_y_slice[srcx + 1]; + tmpU0 = src_y_slice[(srcx / 4) * 4]; + tmpV0 = src_y_slice[(srcx / 4) * 4 + 2]; + srcx = (((uint32_t)(dx + 1) * xrIntFloat_16) >> 16) + xOffset; + srcx = srcx * 2; + tmpY1 = src_y_slice[srcx + 1]; + tmpU1 = src_y_slice[(srcx / 4) * 4]; + tmpV1 = src_y_slice[(srcx / 4) * 4 + 2]; + } + else + { + tmpY0 = src_y_slice[srcx]; + tmpU0 = src_y_slice[(srcx / 4) * 4 + 1]; + tmpV0 = src_y_slice[(srcx / 4) * 4 + 3]; + srcx = (((uint32_t)(dx + 1) * xrIntFloat_16) >> 16) + xOffset; + srcx = srcx * 2; + tmpY1 = src_y_slice[srcx]; + tmpU1 = src_y_slice[(srcx / 4) * 4 + 1]; + tmpV1 = src_y_slice[(srcx / 4) * 4 + 3]; + } + } + + D0 = (tmpU0 - 128); + E0 = (tmpV0 - 128); + D1 = (tmpU1 - 128); + E1 = (tmpV1 - 128); + + B0 = (float)vsi_clamp((tmpY0 + (1.7790 * D0)), min, max); + G0 = (float)vsi_clamp((tmpY0 - 0.3455 * D0 - 0.7169 * 
E0), min, max); + R0 = (float)vsi_clamp((tmpY0 + 1.4065 * E0), min, max); + + B1 = (float)vsi_clamp((tmpY1 + (1.7790 * D1)), min, max); + G1 = (float)vsi_clamp((tmpY1 - 0.3455 * D1 - 0.7169 * E1), min, max); + R1 = (float)vsi_clamp((tmpY1 + 1.4065 * E1), min, max); + + output_index = dx + dy * dst_width; + + dstR_idx = output_index + rOffset; + dstG_idx = output_index + gOffset; + dstB_idx = output_index + bOffset; + + buffer[1][dstB_idx] = (B0 - bMean) * var; + buffer[1][dstG_idx] = (G0 - gMean) * var; + buffer[1][dstR_idx] = (R0 - rMean) * var; + + dstR_idx += 1; + dstG_idx += 1; + dstB_idx += 1; + + buffer[1][dstB_idx] = (B1 - bMean) * var; + buffer[1][dstG_idx] = (G1 - gMean) * var; + buffer[1][dstR_idx] = (R1 - rMean) * var; + } + } + } + } + + if(trans) + { + vsi_size_t shape[] = {attr[1]->shape->data[0], attr[1]->shape->data[1], attr[1]->shape->data[2], 1}; + vsi_size_t perm[] = {1, 2, 0, 3}; + vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[1], + shape, (uint32_t)attr[1]->shape->size, perm, VSI_NN_TYPE_FLOAT32); + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + outBuffer, out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + else + { + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + if(outBuffer) + { + free(outBuffer); + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _pre_process_yuv422_exec() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _pre_process_yuv422_exec; + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CPU_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + status = _query_kernel( kernel, inputs, outputs); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 2; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + int32_t yuv422_type = vsi_nn_kernel_param_get_int32( params, "yuv422_type" ); + + /* Set inputs and outputs */ + 
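+            /* node_params layout once packed: [0] input tensor, [1] output
+             * tensor, followed by the eleven scalars created below in the
+             * order scale_x, scale_y, left, top, r_mean, g_mean, b_mean,
+             * rgb_scale, reverse, enable_perm, yuv422_type (indices 2..12,
+             * matching kernel_param_def). */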
vsi_nn_kernel_node_pack_io( node_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &yuv422_type ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( pre_process_yuv422, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c index 82e9c1a..071e5e7 100644 --- a/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c @@ -73,7 +73,7 @@ static float _compute_region_coordinate(int32_t p, float bin_size, float roi_anc { const float region_start = p * bin_size + roi_anchor; - return vsi_nn_clamp(region_start, 0.0f, max_value - 1); + return region_start; } static float _roi_align_1x1(float *input_ptr, @@ -88,53 +88,64 @@ static float _roi_align_1x1(float *input_ptr, int32_t grid_size_y, float region_end_y) { - if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + float avg = 0; + int32_t iy = 0; + int32_t ix = 0; + // Iterate through the aligned pooling region + for (iy = 0; iy < grid_size_y; ++iy) { - return 0; - } - else - { - float avg = 0; - int32_t iy = 0; - int32_t ix = 0; - // Iterate through the aligned pooling region - for (iy = 0; iy < grid_size_y; ++iy) + for (ix = 0; ix < grid_size_x; ++ix) { - for (ix = 0; ix < grid_size_x; ++ix) - { - // Align the window in the middle of every bin - float y = region_start_y + ((float)iy + 0.5f) * bin_size_y / (float)(grid_size_y); - float x = region_start_x + ((float)ix + 0.5f) * bin_size_x / (float)(grid_size_x); + // Align the window in the middle of every bin + float y = region_start_y + + ((float)iy + 0.5f) * bin_size_y / (float)(grid_size_y); + float x = region_start_x + + ((float)ix + 0.5f) * bin_size_x / (float)(grid_size_x); - // Interpolation in the [0,0] [0,1] [1,0] [1,1] square - const int32_t y_low = (int32_t)y; - const int32_t x_low = (int32_t)x; - const int32_t y_high = vsi_nn_min(y_low + 1, height - 1); - const int32_t x_high = 
vsi_nn_min(x_low + 1, width - 1); + // Interpolation in the [0,0] [0,1] [1,0] [1,1] square + const int32_t y_low = vsi_nn_min((int32_t)y, height - 1); + const int32_t x_low = vsi_nn_min((int32_t)x, width - 1); + const int32_t y_high = vsi_nn_min(y_low + 1, height - 1); + const int32_t x_high = vsi_nn_min(x_low + 1, width - 1); - const float ly = y - y_low; - const float lx = x - x_low; - const float hy = 1.0f - ly; - const float hx = 1.0f - lx; + float ly = y - y_low; + float lx = x - x_low; + float hy = 1.0f - ly; + float hx = 1.0f - lx; - const float w1 = hy * hx; - const float w2 = hy * lx; - const float w3 = ly * hx; - const float w4 = ly * lx; + float w1 = hy * hx; + float w2 = hy * lx; + float w3 = ly * hx; + float w4 = ly * lx; - const float data1 = *(input_ptr + y_low * width + x_low); - const float data2 = *(input_ptr + y_low * width + x_high); - const float data3 = *(input_ptr + y_high * width + x_low); - const float data4 = *(input_ptr + y_high * width + x_high); + const float data1 = *(input_ptr + y_low * width + x_low); + const float data2 = *(input_ptr + y_low * width + x_high); + const float data3 = *(input_ptr + y_high * width + x_low); + const float data4 = *(input_ptr + y_high * width + x_high); - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; - } + /* onnx: inverse elements are out of feature map boundary */ + if (x > width || x < -1 || y > height || y < -1) continue; + + x = x_low >= width - 1 ? x_low : x; + y = y_low >= height - 1 ? y_low : y; + + ly = y - y_low; + lx = x - x_low; + hy = 1.0f - ly; + hx = 1.0f - lx; + + w1 = hy * hx; + w2 = hy * lx; + w3 = ly * hx; + w4 = ly * lx; + + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; } - - avg /= grid_size_x * grid_size_y; - - return avg; } + + avg /= grid_size_x * grid_size_y; + + return avg; } DEF_KERNEL_EXECUTOR(_compute) diff --git a/src/tim/vx/internal/src/kernel/cpu/scatter_elements_cpu.c b/src/tim/vx/internal/src/kernel/cpu/scatter_elements_cpu.c new file mode 100644 index 0000000..b3cfbbc --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/scatter_elements_cpu.c @@ -0,0 +1,258 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */
+#define _ARG_NUM            (2)
+#define _INPUT_NUM          (3)
+#define _OUTPUT_NUM         (1)
+#define _CPU_IO_NUM         (_INPUT_NUM + _OUTPUT_NUM)
+#define _CPU_PARAM_NUM      (_ARG_NUM + _CPU_IO_NUM)
+#define _KERNEL_NAME        CVIVANTE_NAMESPACE("cpu.scatter_elements")
+
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _scatter_elements_kernel_param_def[] =
+{
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    // Add kernel parameters here
+};
+#define _SCATTER_ELEMENTS_PARAM_NUM  _cnt_of_array( _scatter_elements_kernel_param_def )
+
+
+/*
+ * Kernel function
+ */
+DEF_KERNEL_EXECUTOR(_compute)
+    (
+    vsi_nn_kernel_node_t node,
+    const vsi_nn_kernel_node_param_t * param,
+    size_t param_size
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
+    float * buffer[3] = { NULL };
+    int32_t* buffer_idx = NULL;
+    size_t out_elements = 0;
+    vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
+    vsi_size_t a = 0;
+    vsi_size_t o = 0;
+    vsi_size_t i = 0;
+    vsi_size_t outer_size[2] = {1, 1};
+    vsi_size_t inner_size[2] = {1, 1};
+    vsi_size_t axis_size[2] = {1, 1};
+    int32_t axis = 0;
+    int32_t reduction = 0;
+
+    tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
+    tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
+    tensors[2] = (vsi_nn_kernel_tensor_t)param[2];
+    tensors[3] = (vsi_nn_kernel_tensor_t)param[3];
+
+    attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
+    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
+    attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
+    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
+    attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
+    CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
+    attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] );
+    CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final );
+    out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] );
+
+    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis);
+    CHECK_STATUS_FAIL_GOTO(status, final );
+    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &reduction);
+    CHECK_STATUS_FAIL_GOTO(status, final );
+
+    buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
+    CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
+
+    buffer_idx = (int32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE );
+    CHECK_PTR_FAIL_GOTO( buffer_idx, "Create input1 buffer fail.", final );
+
+    buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE );
+    CHECK_PTR_FAIL_GOTO( buffer[1], "Create input2 buffer fail.", final );
+
+    buffer[2] = (float *)malloc( out_elements * sizeof(float) );
+    CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer 
fail.", final ); + memcpy( buffer[2], buffer[0], out_elements * sizeof(float) ); + + axis_size[0] = attr[0]->shape->data[axis]; + axis_size[1] = attr[1]->shape->data[axis]; + for (i = 0; i < (vsi_size_t)axis; ++i) + { + inner_size[0] *= attr[0]->shape->data[i]; + inner_size[1] *= attr[1]->shape->data[i]; + } + + for (i = axis + 1; i < attr[1]->shape->size; ++i) + { + outer_size[0] *= attr[0]->shape->data[i]; + outer_size[1] *= attr[1]->shape->data[i]; + } + + for (o = 0; o < outer_size[1]; o++) + { + for (a = 0; a < axis_size[1]; a++) + { + for (i = 0; i < inner_size[1]; i++) + { + vsi_ssize_t index = 0; + vsi_size_t index0 = (o * axis_size[1] + a) * inner_size[1] + i; + vsi_size_t index1 = 1; + + index = (vsi_ssize_t)buffer_idx[index0]; + index1 = (o * axis_size[0] + index) * inner_size[0] + i; + + switch (reduction) + { + case VSI_NN_REDUCTION_TYPE_NONE: + buffer[2][index1] = buffer[1][index0]; + break; + case VSI_NN_REDUCTION_TYPE_ADD: + buffer[2][index1] += buffer[1][index0]; + break; + case VSI_NN_REDUCTION_TYPE_MUL: + buffer[2][index1] *= buffer[1][index0]; + break; + default: + break; + } + + + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], + buffer[2], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); +final: + if ( buffer_idx ) + { + free( buffer_idx ); + } + for ( i = 0; i < 3; i ++ ) + { + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + for ( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_SUCCESS; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _scatter_elements_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _scatter_elements_kernel_param_def ); + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SCATTER_ELEMENTS_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + int32_t reduction = vsi_nn_kernel_param_get_int32( params, "reduction" ); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SCATTER_ELEMENTS_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/
+            node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &axis );
+            node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &reduction );
+            status = vsi_nn_kernel_node_pass_param( node, node_params, _SCATTER_ELEMENTS_PARAM_NUM );
+            vsi_nn_kernel_scalar_release( &node_params[4] );
+            vsi_nn_kernel_scalar_release( &node_params[5] );
+        }
+    }
+    return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_CPU( scatter_elements, _setup )
+
diff --git a/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c b/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c
new file mode 100644
index 0000000..d7074c3
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c
@@ -0,0 +1,323 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */ +typedef enum +{ + INTERNAL_KERNEL_BUCKETIZE, +} _internal_kernel_e; + +#define STR(a) #a + +// Add kernel hashtable here +#define BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \ + (( IN0_DTYPE ) | ( IN1_DTYPE << 8 ) | ( OUT_DTYPE << 16 ) | (RIGHT << 24) | (IMG_2D << 25)) + +#define PACK_KERNEL_2D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \ + { BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ), \ + CVIVANTE_NAMESPACE("evis.bucketize_right_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + "bucketize" } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _bucketize_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_2D_MAP( F16, F16, I32, 1, 1 ), + PACK_KERNEL_2D_MAP( I16, I16, I32, 1, 1 ), + PACK_KERNEL_2D_MAP( U8, U8, I32, 1, 1 ), + PACK_KERNEL_2D_MAP( I8, I8, I32, 1, 1 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _bucketize_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _BUCKETIZE_PARAM_NUM _cnt_of_array( _bucketize_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_bucketize_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * input0_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input1_attr = NULL; + vsi_size_array_t * input0_shape = NULL; + vsi_size_array_t * input1_shape = NULL; + + input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input0_attr, "Create tensor attr buffer fail.", final ); + input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( input1_attr, "Create tensor attr buffer fail.", final ); + + input0_shape = input0_attr->shape; + input1_shape = input1_attr->shape; + + gpu_param.dim = 2; + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (input0_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (input0_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + + { + gpu_dp_inst_t uniDataConvert_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDataConvert_1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + int32_t boundaries_size = (int32_t)input1_shape->data[0]; + int32_t boundaries_size_x8 = (boundaries_size / 8) * 8; + + status = vsi_nn_kernel_gpu_add_param( node, "uniDataConvert_0_4x4", &uniDataConvert_0_4x4); + 
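+        /*
+         * The parameters pushed to the shader here mirror the values computed
+         * above: uniDataConvert_0/1_4x4 appear to widen the eight input lanes to
+         * 32-bit, boundaries_size_x8 is the boundary count rounded down to a
+         * multiple of eight (presumably so eight boundaries can be compared per
+         * iteration), and boundaries_size covers the remaining tail entries.
+         */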
status |= vsi_nn_kernel_gpu_add_param( node, "uniDataConvert_1_4x4", &uniDataConvert_1_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "boundaries_size_x8", &boundaries_size_x8); + status |= vsi_nn_kernel_gpu_add_param( node, "boundaries_size", &boundaries_size); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(input0_attr); + SAFE_FREE_TENSOR_ATTR(input1_attr); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _bucketize_initializer() */ + +static vsi_bool _bucketize_support_types + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * boundaries, + int32_t right + ) +{ + vsi_size_t width = input->attr.size[0]; + vsi_size_t height = input->attr.size[1]; + vsi_size_t boundaries_size = boundaries->attr.size[0]; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_dtype_e in_dtype = vsi_nn_kernel_map_dtype( input->attr.dtype.vx_type ); + + image_2d = (input->attr.dim_num == 2 || input->attr.size[2] == 1); + + if ( vsi_nn_is_same_type(input, boundaries) == FALSE || right == 0 || image_2d == FALSE ) + { + return FALSE; + } + + if (in_dtype == F16 && graph->ctx->config.evis.ver != VSI_NN_HW_EVIS_2) + { + return FALSE; + } + +#define MAX_16BITS_BOUNDARIES_SIZE (0xFFFF) + if ( (in_dtype == F16 || in_dtype == I16) && boundaries_size > MAX_16BITS_BOUNDARIES_SIZE ) + { + return FALSE; + } +#undef MAX_16BITS_BOUNDARIES_SIZE + +#define MAX_8BITS_BOUNDARIES_SIZE (0xFF) + if ( (in_dtype == I8 || in_dtype == U8) && boundaries_size > MAX_8BITS_BOUNDARIES_SIZE ) + { + return FALSE; + } +#undef MAX_8BITS_BOUNDARIES_SIZE + +#define INPUT_SIZE_ALIGN8 (8) + if ( width % INPUT_SIZE_ALIGN8 != 0 && height != 1 ) + { + return FALSE; + } +#undef INPUT_SIZE_ALIGN8 + + return TRUE; +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t right + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _bucketize_kernel_map; + size_t kernel_map_size = _cnt_of_array( _bucketize_kernel_map ); + vx_param_description_t * param_def = _bucketize_kernel_param_def; + vx_kernel_initialize_f initializer = _bucketize_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = BUCKETIZE_HASH_KEY( in0_dtype, in1_dtype, out_dtype, right, 1 ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _bucketize_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return 
status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_BUCKETIZE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t right = vsi_nn_kernel_param_get_int32( params, "right" ); + + if( !vsi_nn_kernel_gpu_check_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) + { + return NULL; + } + + if ( _bucketize_support_types(graph, inputs[0], inputs[1], right) == FALSE ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, right ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _BUCKETIZE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _BUCKETIZE_PARAM_NUM ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( bucketize, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c index 510069b..be2db5e 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c @@ -158,7 +158,7 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) if (srcFixPointPos >= 0) output_scale *= (vx_float32)((int64_t)1 << srcFixPointPos); else if (srcFixPointPos < 0) - output_scale *= 1.0f / (vx_float32) ((int64_t)1 << -srcFixPointPos); + output_scale *= 1.0f / (vx_float32) ((int64_t)1 << - srcFixPointPos); } else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant ) { diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c index f641e10..af31e07 100644 --- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -47,7 +47,8 @@ __BEGIN_DECLS typedef enum { INTERNAL_KERNEL_SUMS, - INTERNAL_KERNEL_NORM, + INTERNAL_KERNEL_MEANS, + INTERNAL_KERNEL_NORMS, } _internal_kernel_e; #define KERNEL_SOURCE_0 "instance_normalization_0" @@ -61,6 +62,9 @@ typedef enum #define HASH_INSTANCENORM_SUMS_SH_KERNEL_2D_NAME(SRC0_TYPE) \ CVIVANTE_NAMESPACE("evis.instance_norm_sums_"#SRC0_TYPE"_2D") +#define HASH_INSTANCENORM_MEANS_SH_KERNEL_NAME() \ + CVIVANTE_NAMESPACE("evis.instance_norm_means") + #define HASH_INSTANCENORM_SCALE_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"_F32to"#DST_TYPE) @@ -68,8 +72,8 @@ typedef enum CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"_F32to"#DST_TYPE"_2D") // Add kernel hashtable here -#define HASH_INSTANCENORM_SUMS_KEY(_input0_type, _output_type, _reshape_flag) \ - ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) +#define HASH_INSTANCENORM_SUMS_KEY(_input0_type, _output_type, _img_2d) \ + ((_input0_type << 24) | (_output_type << 16) | (_img_2d << 8)) #define TENSOR_INSTANCENORM_SUMS_KERNELS_3D(IN0_TYPE, OUT_TYPE, SOURCE) \ { HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 0), \ @@ -81,6 +85,14 @@ typedef enum HASH_INSTANCENORM_SUMS_SH_KERNEL_2D_NAME(IN0_TYPE), \ SOURCE }, +#define 
HASH_INSTANCENORM_MEANS_KEY(ALPHA_TYPE, BETA_TYPE) \ + ((F32 << 24) | (ALPHA_TYPE << 16) | (BETA_TYPE << 8) | (F32)) + +#define TENSOR_INSTANCENORM_MEANS_KERNELS(ALPHA_TYPE, BETA_TYPE) \ + { HASH_INSTANCENORM_MEANS_KEY(ALPHA_TYPE, BETA_TYPE), \ + HASH_INSTANCENORM_MEANS_SH_KERNEL_NAME(), \ + KERNEL_SOURCE_0 }, + // normalization #define HASH_INSTANCENORM_KEY(_input0_type, _input1_type, _output_type, _reshape_flag) \ ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_reshape_flag << 4)) @@ -117,6 +129,13 @@ static const _kernel_map_type _instancenorm_sums_kernel_map[] = TENSOR_INSTANCENORM_SUMS_KERNELS_2D( BF16, F32, KERNEL_SOURCE_3 ) }; +static const _kernel_map_type _instancenorm_means_kernel_map[] = +{ + // Register kernel here + TENSOR_INSTANCENORM_MEANS_KERNELS( F32, F32 ) +}; + + static const _kernel_map_type _instancenorm_kernel_map[] = { // Register kernel here @@ -162,15 +181,36 @@ static vx_param_description_t _instancenorm_sums_kernel_param_def[] = }; #define _INSTANCENORM_SUMS_PARAM_NUM _cnt_of_array( _instancenorm_sums_kernel_param_def ) -static vx_param_description_t _instancenorm_kernel_param_def[] = +static vx_param_description_t _instancenorm_means_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _INSTANCENORM_MEANS_PARAM_NUM _cnt_of_array( _instancenorm_means_kernel_param_def ) +#define MEANS_EPS_SCL (4) +#define MEANS_INPUT_SCALE_SCL (5) +#define MEANS_INPUT_ZP_SCL (6) +#define MEANS_OUTPUT_SCALE_SCL (7) +#define MEANS_OUTPUT_ZP_SCL (8) +#define MEANS_INV_MULTIPLIER_SCL (9) +#define MEANS_GROUP_NUM_SCL (10) + +static vx_param_description_t _instancenorm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; #define _INSTANCENORM_PARAM_NUM _cnt_of_array( _instancenorm_kernel_param_def ) @@ -195,7 +235,6 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer) vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; vsi_size_array_t * input_shape = NULL; - int32_t rs_flag = 0; int32_t width = 0; int32_t height = 0; int32_t chn = 0; @@ -212,7 +251,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer) attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &rs_flag); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &height); CHECK_STATUS_FAIL_GOTO(status, OnError ); input_shape = attr[0]->shape; @@ -221,12 +260,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer) input_zp = (float)attr[0]->zero_point; width = 
(int32_t)(input_shape->data[0]); - height = (int32_t)(input_shape->data[1]); chn = (int32_t)(attr[1]->shape->data[1]); - if (rs_flag) - { - height = height / chn; - } work_item_pixels = (float)height * 16; @@ -333,7 +367,6 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer) } status = vsi_nn_kernel_gpu_add_param(node, "width", &width); - status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); CHECK_STATUS_FAIL_GOTO(status, OnError ); OnError: @@ -351,6 +384,55 @@ OnError: return status; } +DEF_KERNEL_INITIALIZER(_instancenorm_means_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + vsi_size_array_t * input_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + input_shape = attr[0]->shape; + + shaderParam.global_scale[0] = 1; + shaderParam.global_scale[1] = 1; + + shaderParam.global_size[0] = 1; + shaderParam.global_size[1] = input_shape->data[1]; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} + DEF_KERNEL_INITIALIZER(_instancenorm_initializer) ( vsi_nn_kernel_node_t node, @@ -366,52 +448,26 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - vsi_nn_kernel_tensor_attr_t* attr[4] = {NULL, NULL}; + vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL, NULL}; vsi_size_array_t * input_shape = NULL; - float input_scale = 1; - float output_scale = 1; - float input_zp = 0; - float output_zp = 0; - float inv_multiplier = 0; - vx_uint32 group_num = 0; - vx_int32 height = 0, width = 0, chn = 0; - int32_t rs_flag = 0; + vx_int32 width = 0, chn = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); - attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); - attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); - CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError ); - - status = 
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &rs_flag); - CHECK_STATUS_FAIL_GOTO(status, OnError ); input_shape = attr[0]->shape; - input_scale = attr[0]->scale; - input_zp = (float)attr[0]->zero_point; - output_scale = 1.0f / attr[3]->scale; - output_zp = (float)attr[3]->zero_point; width = (int32_t)(input_shape->data[0]); - height = (int32_t)(input_shape->data[1]); - chn = (int32_t)(attr[2]->shape->data[1]); - if (rs_flag) - { - height = height / chn; - } - - inv_multiplier = (float)(1.0 / (width * height)); - - group_num = (width + 255) / 256; + chn = (int32_t)(attr[1]->shape->data[1]); shaderParam.global_scale[0] = 16; if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == BF16) { shaderParam.global_scale[0] = 8; - group_num = (width + 127) / 128; } shaderParam.global_scale[1] = 1; @@ -521,12 +577,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) #define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ (IN0_TYPE | (OUT_TYPE << 16)) - pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[3]->dtype ); - - status = vsi_nn_kernel_gpu_add_param(node, "height", &height); - status |= vsi_nn_kernel_gpu_add_param(node, "inv_multiplier", &inv_multiplier); - status |= vsi_nn_kernel_gpu_add_param(node, "group_num", &group_num); - CHECK_STATUS_FAIL_GOTO(status, OnError ); + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype ); switch( pack_key ) { @@ -535,7 +586,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) case _PACK_SELECT_KEY( U8, U8 ): case _PACK_SELECT_KEY( I8, I8 ): { - if (attr[3]->dtype == F16) + if (attr[2]->dtype == F16) { status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); @@ -544,11 +595,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) { status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); - status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); } - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); - status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", &uniDataToFP32_0_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", @@ -567,7 +614,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) case _PACK_SELECT_KEY( F16, U8 ): case _PACK_SELECT_KEY( F16, I8 ): { - if (attr[3]->dtype == F16) + if (attr[2]->dtype == F16) { status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); @@ -577,14 +624,10 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); } - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); - status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", &uniDataToFP32_0_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", &uniDataToFP32_1_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); - status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; @@ -612,21 +655,18 @@ OnError: vsi_nn_kernel_tensor_attr_release( &attr[0] ); attr[0] = NULL; } + if (attr[1]) { vsi_nn_kernel_tensor_attr_release( &attr[1] ); attr[1] = NULL; } + if (attr[2]) { vsi_nn_kernel_tensor_attr_release( &attr[2] ); attr[2] = NULL; } - if 
(attr[3]) - { - vsi_nn_kernel_tensor_attr_release( &attr[3] ); - attr[3] = NULL; - } return status; } @@ -637,7 +677,9 @@ OnError: static vsi_status _query_kernel ( vsi_nn_kernel_t * kernel, - const uint32_t hashkey, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool img_2d, _internal_kernel_e kernel_id /* Add extra params */ ) @@ -649,6 +691,18 @@ static vsi_status _query_kernel size_t kernel_map_size = 0; size_t param_size = 0; uint32_t i = 0; + uint32_t hashkey = 0; + vsi_nn_kernel_dtype_e in0_dtype = U8; + vsi_nn_kernel_dtype_e in1_dtype = F16; + vsi_nn_kernel_dtype_e in2_dtype = F16; + vsi_nn_kernel_dtype_e out_dtype = U8; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + in2_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + in1_dtype = in1_dtype == F16 ? F32 : in1_dtype; + in2_dtype = in2_dtype == F16 ? F32 : in2_dtype; switch ( kernel_id ) { @@ -658,13 +712,23 @@ static vsi_status _query_kernel kernel_map_size = _cnt_of_array( _instancenorm_sums_kernel_map ); param_def = _instancenorm_sums_kernel_param_def; param_size = _INSTANCENORM_SUMS_PARAM_NUM; + hashkey = HASH_INSTANCENORM_SUMS_KEY( in0_dtype, F32, img_2d ); break; - case INTERNAL_KERNEL_NORM: + case INTERNAL_KERNEL_MEANS: + initializer = _instancenorm_means_initializer; + kernel_map = _instancenorm_means_kernel_map; + kernel_map_size = _cnt_of_array( _instancenorm_means_kernel_map ); + param_def = _instancenorm_means_kernel_param_def; + param_size = _INSTANCENORM_MEANS_PARAM_NUM; + hashkey = HASH_INSTANCENORM_MEANS_KEY( in1_dtype, in2_dtype ); + break; + case INTERNAL_KERNEL_NORMS: initializer = _instancenorm_initializer; kernel_map = _instancenorm_kernel_map; kernel_map_size = _cnt_of_array( _instancenorm_kernel_map ); param_def = _instancenorm_kernel_param_def; param_size = _INSTANCENORM_PARAM_NUM; + hashkey = HASH_INSTANCENORM_KEY( in0_dtype, F32, out_dtype, img_2d ); break; default: VSI_ASSERT( FALSE ); @@ -709,23 +773,21 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * kernel ) { -#define INTERNAL_KERNEL_SIZE (1) -#define MEAN_VARI_INDEX (0) +#define INTERNAL_KERNEL_SIZE (2) +#define SUMS_INDEX (0) +#define MEANS_INDEX (1) vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t sums_node_params[_INSTANCENORM_SUMS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t means_node_params[_INSTANCENORM_MEANS_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_param_t node_params[_INSTANCENORM_PARAM_NUM] = { NULL }; - vsi_nn_kernel_node_t tmp_node = NULL; - vsi_nn_kernel_node_t node = NULL; - vsi_nn_kernel_dtype_e in0_dtype = U8; - vsi_nn_kernel_dtype_e in1_dtype = F16; - vsi_nn_kernel_dtype_e out_dtype = U8; + vsi_nn_kernel_node_t sums_node = NULL; + vsi_nn_kernel_node_t means_node = NULL; + vsi_nn_kernel_node_t norms_node = NULL; vsi_nn_tensor_attr_t attr; vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; - uint32_t hashkey = 0; int32_t i = 0; int32_t axis[VSI_NN_MAX_DIM_NUM] = {0, 1}; int32_t axis_num = 2; @@ -735,35 +797,47 @@ static vsi_nn_kernel_node_t _setup uint32_t rank = outputs[0]->attr.dim_num; vsi_nn_tensor_t *reshape_tensor[2] = 
{NULL}; float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float in_time_out_scale = vsi_nn_get_tensor_scale(inputs[0]) * output_scale; + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inv_multiplier = 1.0f / (float)(inputs[0]->attr.size[0] * inputs[0]->attr.size[1]); + int32_t height = 0; + int32_t group_num = 0; int32_t reshape_flg = 0; vsi_size_t batch = 1; vsi_bool ret = FALSE; - ret = vsi_nn_kernel_optimize_tensor_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, - axis, axis_num, new_shape, &rank, new_axis, &axis_size); - if ( ret == FALSE || axis_size > 2 ) - { - return NULL; - } + memcpy(new_shape, inputs[0]->attr.size, sizeof(inputs[0]->attr.size)); - for (i = 3; i < (int32_t)inputs[0]->attr.dim_num; i++) + if (new_shape[0] >= GPU_TENSOR_MAX_WIDTH || new_shape[1] >= GPU_TENSOR_MAX_WIDTH) { - batch *= inputs[0]->attr.size[i]; - } - - if (axis_size == 1) - { - for (i = rank; i > 1; i--) + ret = vsi_nn_kernel_optimize_tensor_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + axis, axis_num, new_shape, &rank, new_axis, &axis_size); + if ( ret == FALSE || axis_size > 2 ) { - new_shape[i] = new_shape[i - 1]; + return NULL; } - new_shape[1] = 1; - rank ++; + + for (i = 3; i < (int32_t)inputs[0]->attr.dim_num; i++) + { + batch *= inputs[0]->attr.size[i]; + } + + if (axis_size == 1) + { + for (i = rank; i > 1; i--) + { + new_shape[i] = new_shape[i - 1]; + } + new_shape[1] = 1; + rank ++; + } + new_shape[2] = rank == 2 ? 1 : new_shape[2] / batch; + new_shape[3] = batch; + rank = 4; } - new_shape[2] = rank == 2 ? 1 : new_shape[2] / batch; - new_shape[3] = batch; - rank = 4; reshape_tensor[0] = vsi_nn_reshape_tensor( graph, inputs[0], new_shape, rank ); @@ -786,24 +860,7 @@ static vsi_nn_kernel_node_t _setup ikernels[i]->unique_id = kernel->unique_id; } - in0_dtype = vsi_nn_kernel_map_dtype( reshape_tensor[0]->attr.dtype.vx_type ); - in1_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); - out_dtype = vsi_nn_kernel_map_dtype( reshape_tensor[1]->attr.dtype.vx_type ); - in1_dtype = in1_dtype == F16 ? F32 : in1_dtype; - - hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_SUMS_KEY( in0_dtype, F32, reshape_flg ); - hashkey = HASH_INSTANCENORM_KEY( in0_dtype, in1_dtype, out_dtype, reshape_flg ); - - status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_SUMS ); - if ( VSI_SUCCESS != status ) - { - goto final; - } - status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM ); - if ( VSI_SUCCESS != status ) - { - goto final; - } + height = (int32_t)new_shape[1]; if (reshape_flg) { @@ -816,6 +873,8 @@ static vsi_nn_kernel_node_t _setup } else if (new_shape[0] < new_shape[1]) { + height = (int32_t)new_shape[0]; + shape[0] = new_shape[1]; shape[1] = new_shape[0]; shape[2] = new_shape[2]; @@ -835,78 +894,121 @@ static vsi_nn_kernel_node_t _setup attr.is_const = FALSE; attr.vtl = TRUE; attr.size[0] = ((shape[0] + 255) / 256) * 4; + group_num = gpu_align_np2_safe((int32_t)shape[0], 256) / 256; if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16) { + group_num = gpu_align_np2_safe((int32_t)shape[0], 128) / 128; attr.size[0] = ((shape[0] + 127) / 128) * 4; } attr.size[1] = inputs[0]->attr.dim_num > 2 ? 
inputs[0]->attr.size[2] : 1; attr.size[2] = 1; attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; attr.dim_num = 4; - tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + tensors[SUMS_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + attr.size[0] = 4; + tensors[MEANS_INDEX] = vsi_nn_CreateTensor( graph, &attr ); shape[0] = 1; shape[1] = rank > 2 ? new_shape[2] : 1; rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 2 ); rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 2 ); - // Mean Vari + /* x0 = sum(x) and x1 = sum(x * x) */ + status = _query_kernel( ikernels[SUMS_INDEX], inputs, outputs, reshape_flg, INTERNAL_KERNEL_SUMS ); + if ( VSI_SUCCESS != status ) { - tmp_node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] ); - if (tmp_node) + goto final; + } + + sums_node = vsi_nn_kernel_create_node( graph, ikernels[SUMS_INDEX] ); + if (sums_node) + { + uint32_t index = 0; + + + sums_node_params[index++] = rs_input; + vsi_nn_kernel_node_pack_io( &sums_node_params[index], + _INSTANCENORM_SUMS_PARAM_NUM, NULL, 0, tensors, 1 ); + index = 2; + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + + status = vsi_nn_kernel_node_pass_param( sums_node, sums_node_params, + _INSTANCENORM_SUMS_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &sums_node_params[2] ); + vsi_nn_kernel_scalar_release( &sums_node_params[3] ); { - uint32_t index = 0; + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; - sums_node_params[index++] = rs_input; - vsi_nn_kernel_node_pack_io( &sums_node_params[index], - _INSTANCENORM_SUMS_PARAM_NUM, NULL, 0, tensors, 1 ); - index = 2; - sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); - sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); + vsi_nn_Float32ToDtype(0, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype); - status = vsi_nn_kernel_node_pass_param( tmp_node, sums_node_params, - _INSTANCENORM_SUMS_PARAM_NUM ); + status = vxSetNodeAttribute( (vx_node)sums_node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); - vsi_nn_kernel_scalar_release( &sums_node_params[2] ); - vsi_nn_kernel_scalar_release( &sums_node_params[3] ); - { - // Set default border mode. 
- vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - - vsi_nn_Float32ToDtype(0, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype); - - status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); - CHECK_STATUS(status); - } } } - // Nomalization + /* a = input_scale * output_scale * alpha * mean + b = (beta - scale * mean) * output_scale + output_zp - input * alpha */ + status = _query_kernel( ikernels[MEANS_INDEX], inputs, outputs, reshape_flg, INTERNAL_KERNEL_MEANS ); + if ( VSI_SUCCESS != status ) { - node = vsi_nn_kernel_create_node( graph, kernel ); - if (node) - { - uint32_t index = 0; - node_params[index++] = rs_input; - node_params[index++] = rs_beta; - node_params[index++] = rs_gamma; - node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; - node_params[index++] = rs_output; - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); - - status = vsi_nn_kernel_node_pass_param( node, node_params, - _INSTANCENORM_PARAM_NUM ); - CHECK_STATUS(status); - vsi_nn_kernel_scalar_release( &node_params[5] ); - vsi_nn_kernel_scalar_release( &node_params[6] ); - } + goto final; + } + + means_node = vsi_nn_kernel_create_node( graph, ikernels[MEANS_INDEX] ); + if (means_node) + { + means_node_params[0] = tensors[SUMS_INDEX]->t; + means_node_params[1] = rs_beta; + means_node_params[2] = rs_gamma; + means_node_params[3] = tensors[MEANS_INDEX]->t; + + means_node_params[MEANS_EPS_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + means_node_params[MEANS_INPUT_SCALE_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &in_time_out_scale ); + means_node_params[MEANS_INPUT_ZP_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp ); + means_node_params[MEANS_OUTPUT_SCALE_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + means_node_params[MEANS_OUTPUT_ZP_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + means_node_params[MEANS_INV_MULTIPLIER_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &inv_multiplier ); + means_node_params[MEANS_GROUP_NUM_SCL] = vsi_nn_kernel_scalar_create( graph, I32, &group_num ); + + status = vsi_nn_kernel_node_pass_param( means_node, means_node_params, + _INSTANCENORM_MEANS_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &means_node_params[MEANS_EPS_SCL] ); + vsi_nn_kernel_scalar_release( &means_node_params[MEANS_INPUT_SCALE_SCL] ); + vsi_nn_kernel_scalar_release( &means_node_params[MEANS_INPUT_ZP_SCL] ); + vsi_nn_kernel_scalar_release( &means_node_params[MEANS_OUTPUT_SCALE_SCL] ); + vsi_nn_kernel_scalar_release( &means_node_params[MEANS_OUTPUT_ZP_SCL] ); + vsi_nn_kernel_scalar_release( &means_node_params[MEANS_INV_MULTIPLIER_SCL] ); + vsi_nn_kernel_scalar_release( &means_node_params[MEANS_GROUP_NUM_SCL] ); + } + + /* dst = x * a + b */ + status = _query_kernel( kernel, inputs, outputs, reshape_flg, INTERNAL_KERNEL_NORMS ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + norms_node = vsi_nn_kernel_create_node( graph, kernel ); + if (norms_node) + { + uint32_t index = 0; + node_params[index++] = rs_input; + node_params[index++] = tensors[MEANS_INDEX]->t; + node_params[index++] = rs_output; + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + + status = vsi_nn_kernel_node_pass_param( norms_node, node_params, + _INSTANCENORM_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[3] ); } - /* Pass 
parameters to node. */ final: vsi_safe_release_tensor(reshape_tensor[0]); vsi_safe_release_tensor(reshape_tensor[1]); @@ -934,8 +1036,10 @@ final: } vsi_safe_release_tensor(tensors[i]); } - if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} - return node; + if (sums_node) {vsi_nn_kernel_node_release( &sums_node );} + if (means_node) {vsi_nn_kernel_node_release( &means_node );} + + return norms_node; } /* _setup() */ __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c index 5825491..8157779 100644 --- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -121,6 +121,7 @@ static const struct { TENSOR_MATRIX_MUL_TRANSB_KERNELS(F16, U8, U8, KERNEL_SOURCE_4) TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, F16, KERNEL_SOURCE_5) TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, U8, KERNEL_SOURCE_5) + TENSOR_MATRIX_MUL_TRANSB_KERNELS(I16, I16, I16, KERNEL_SOURCE_13) TENSOR_MATRIX_MUL_TRANSB_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15) TENSOR_MATRIX_MUL_TRANSA_KERNELS(U8, U8, U8, KERNEL_SOURCE_7) TENSOR_MATRIX_MUL_TRANSA_KERNELS(I8, I8, I8, KERNEL_SOURCE_7) @@ -622,11 +623,33 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniI16MulI16SumtoI32_16x1 = {{ + 0xaaaa5555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0xaaaa5555, // BSelt + 0x76543210, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00020001, 0x00040003, 0x00060005, 0x00080007 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniI16MulI16SumtoI32B_16x1 = {{ + 0x0002aaab, // TCfg + 0x00015554, // ASelt + 0x65432100, 0x00000007, // ABin + 0x0002aaa8, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002300, // AccumType, ConstantType, and PostShift + 0x00010000, 0x00030002, 0x00050004, 0x00070006, + 0x00000008, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + float scaleIn0divOut = src0Scale / dstScale; float scaleIn1divOut = src1Scale / dstScale; float inScaleMul = src0Scale * src1Scale; float reScaleOut = 1 / dstScale; float inScaledivOut = inScaleMul / dstScale; + float inout_beta = src0ZP * src1ZP * 8 * inScaledivOut + dstZP; uint32_t multiplierA = (M0 << 16) | M0; uint32_t multiplierB = (M1 << 16) | M1; uint32_t multiplierZpA = (src0ZP << 16) | src0ZP; @@ -647,6 +670,14 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) uniGemmFp16U8MulZptoFp32_4x4.data[i] = multiplierZpB; uniGemmFp16I16MulZptoFp32_4x4.data[i] = multiplierZpB; } + for( i = 8; i < 12; i++) + { + uniI16MulI16SumtoI32B_16x1.data[i] = multiplierZpA; + } + for( i = 12; i < 16; i++) + { + uniI16MulI16SumtoI32_16x1.data[i] = multiplierZpB; + } switch( pack_key ) { @@ -746,6 +777,8 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) break; case _PACK_SELECT_KEY( I16, I16, I16, 0, 0, 0 ): case _PACK_SELECT_KEY( I16, I16, I16, 0, 0, 1 ): + case _PACK_SELECT_KEY( I16, I16, I16, 0, 1, 0 ): + case _PACK_SELECT_KEY( I16, I16, I16, 0, 1, 1 ): { status = vsi_nn_kernel_gpu_add_param( node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); @@ -753,10 +786,16 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) "uniConvertUint8SubZpToFp32_4x4", &uniConvertUint8SubZpToFp32_4x4 ); status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertUint8SubZpToFp32B_4x4", &uniConvertUint8SubZpToFp32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( 
node, + "uniI16MulI16SumtoI32_16x1", &uniI16MulI16SumtoI32_16x1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniI16MulI16SumtoI32B_16x1", &uniI16MulI16SumtoI32B_16x1 ); status |= vsi_nn_kernel_gpu_add_param( node, "input0_ZP", &src0ZP ); status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &src1ZP ); status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); status |= vsi_nn_kernel_gpu_add_param( node, "outputScale", &reScaleOut ); + status |= vsi_nn_kernel_gpu_add_param( node, "inout_scale", &inScaledivOut ); + status |= vsi_nn_kernel_gpu_add_param( node, "inout_beta", &inout_beta ); } break; case _PACK_SELECT_KEY( F16, U8, F16, 0, 0, 0 ): diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c index e92b248..2c529ce 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c @@ -43,14 +43,18 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8") #define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toI8") #define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toF16") // greater than a quarter #define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOU8_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8_gq") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOI8_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8_gq") #define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOF16_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toF16_gq") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOI16_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toI16_gq") -#define KERNEL_SOURCE_1 "pre_process_nv12_scale_8bits", +#define KERNEL_SOURCE_1 "pre_process_nv12_copy", #define KERNEL_SOURCE_2 "pre_process_nv12_scale", -#define KERNEL_SOURCE_4 "pre_process_nv12_scale_mix" typedef enum { @@ -78,13 +82,18 @@ static const struct { const char* source_name; } pre_process_nv12_map[] = { - TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) - TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I8, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I16, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_KERNELS(U8, F16, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2) TENSOR_PRE_PROCESS_NV12_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2) TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_4) - TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_4) + TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2) }; 
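+/*
+ * Selection note, as read from the table above: each entry pairs a U8 input
+ * with one of the U8/I8/I16/F16 outputs and a COPY or SCALE convert type, and
+ * maps it to the shader name plus the .vx source that provides it.  The *_GQ
+ * scale variants are the "greater than a quarter" kernels chosen when the
+ * resize ratio is at least 0.25 (see the resize >= 0.25 branch below).
+ */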
static vx_param_description_t vxPreProcessNv12Kernel_param_def[] = @@ -120,8 +129,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - int32_t dstZP = 0; - float dstScale = 1; + float output_zp = 0; + float output_scale = 1; int32_t reorder = 0; int32_t order1 = 2; uint32_t width = 0; @@ -148,6 +157,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; + output_scale = 1.0f / attr[0]->scale; + output_zp = (float)attr[0]->zero_point; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -157,33 +168,10 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) order1 = 0; } - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - dstScale = 1.0f / attr[0]->asymm.scale; - dstZP = attr[0]->asymm.zero_point; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - dstScale = (vx_float32)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[0]->dfp.fl)); - } - dstZP = 0; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - dstScale = 1; - dstZP = 0; - } - - outputScaleVar = dstScale * var; - bMeanScaleVarZp = (float)dstZP - bMean * outputScaleVar; - gMeanScaleVarZp = (float)dstZP - gMean * outputScaleVar; - rMeanScaleVarZp = (float)dstZP - rMean * outputScaleVar; + outputScaleVar = output_scale * var; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar; shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; @@ -249,18 +237,46 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); + switch( attr[0]->dtype ) + { + case U8: + case I8: + case I16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case F16: + { + 
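+            /*
+             * Both arms of this switch bind the same shader symbol,
+             * uniExtract8Data_2x8: the integer cases above reuse
+             * uniConvertInt32toUint8_2x8, while this F16 case supplies the
+             * half-precision variant just below, so one copy program can
+             * presumably serve every supported output type.
+             */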
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } CHECK_STATUS_FAIL_GOTO(status, OnError ); } @@ -288,8 +304,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - int32_t dstZP = 0; - float dstScale = 1; + float output_zp = 0; + float output_scale = 1; int32_t reorder = 0; int32_t order1 = 2; uint32_t width = 0; @@ -330,8 +346,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[1]->shape; - dstZP = attr[1]->asymm.zero_point; - dstScale = attr[1]->asymm.scale; + output_scale = 1.0f / attr[1]->scale; + output_zp = (float)attr[1]->zero_point; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -347,32 +363,10 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1); yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1); - if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - dstScale = 1.0f / dstScale; - } - else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[1]->dfp.fl > 0) - { - dstScale = (vx_float32)((int64_t)1 << attr[1]->dfp.fl); - } - else - { - dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[1]->dfp.fl)); - } - dstZP = 0; - } - else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - dstScale = 1; - dstZP = 0; - } - - outputScaleVar = dstScale * var; - bMeanScaleVarZp = (float)dstZP - bMean * outputScaleVar; - gMeanScaleVarZp = (float)dstZP - gMean * outputScaleVar; - rMeanScaleVarZp = (float)dstZP - rMean * outputScaleVar; + outputScaleVar = output_scale * var; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar; shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; @@ -482,7 +476,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); - if (resize >= 0.25 && (attr[1]->dtype == U8 || attr[1]->dtype == F16)) + if (resize >= 0.25) { status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8); @@ -499,13 +493,13 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) case I8: case I16: { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; case F16: { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", &uniConvertHalftoFp16_2x8); + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; @@ -551,7 +545,7 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if (enable_copy && output_dtype == U8) + if (enable_copy) { convert_type = COPY; } @@ -560,16 +554,16 @@ static vsi_status _query_kernel convert_type = SCALE; } - if 
(scaleVal >= 0.25 && (output_dtype == U8 || output_dtype == F16) && convert_type == SCALE) + if (scaleVal >= 0.25 && convert_type == SCALE) { optFlg = 1; } key = HASH_PRE_PROCESS_NV12_KEY( input0_dtype, output_dtype, convert_type, optFlg ); - for( i = 0; i < _cnt_of_array(pre_process_nv12_map); i ++ ) + for ( i = 0; i < _cnt_of_array(pre_process_nv12_map); i ++ ) { - if( pre_process_nv12_map[i].key == key ) + if ( pre_process_nv12_map[i].key == key ) { break; } @@ -580,7 +574,7 @@ static vsi_status _query_kernel kernel->info.parameters = vxPreProcessNv12Kernel_param_def; kernel->info.numParams = _cnt_of_array( vxPreProcessNv12Kernel_param_def ); - if(convert_type == COPY) + if (convert_type == COPY) { kernel->info.initialize = _pre_process_nv12_copy_initializer; } @@ -666,10 +660,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[12] ); } } - if(reshape_tensors[0]) - { - vsi_nn_ReleaseTensor(&reshape_tensors[0]); - } + vsi_safe_release_tensor(reshape_tensors[0]); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c index a51eab1..8e5f779 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c @@ -43,13 +43,13 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toU8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toI8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toI16") #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toF16") -#define KERNEL_SOURCE_1 "pre_process_yuv420_scale_u8", -#define KERNEL_SOURCE_2 "pre_process_yuv420_copy_u8", -#define KERNEL_SOURCE_3 "pre_process_yuv420_scale_fp16", -#define KERNEL_SOURCE_4 "pre_process_yuv420_scale_i16", -#define KERNEL_SOURCE_5 "pre_process_yuv420_scale_i8", +#define KERNEL_SOURCE_0 "pre_process_yuv420_copy", +#define KERNEL_SOURCE_1 "pre_process_yuv420_scale_0", +#define KERNEL_SOURCE_2 "pre_process_yuv420_scale_1", typedef enum { @@ -73,12 +73,14 @@ static const struct { const char* source_name; } pre_process_yuv420_map[] = { - TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_3) - TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_4) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2) TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) - TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_5) - TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY, KERNEL_SOURCE_0) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, COPY, KERNEL_SOURCE_0) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, COPY, KERNEL_SOURCE_0) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I16, COPY, KERNEL_SOURCE_0) }; static vx_param_description_t 
vxPreProcessYuv420Kernel_param_def[] = @@ -115,13 +117,13 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - int32_t dstZP = 0; - float dstScale = 1; - int32_t reorder = 0; - int32_t trans = 0; - int32_t order1 = 2; - uint32_t width = 0; - uint32_t height = 0; + float output_zp = 0; + float output_scale = 1; + int32_t reorder = 0; + int32_t trans = 0; + int32_t order1 = 2; + uint32_t width = 0; + uint32_t height = 0; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; @@ -149,23 +151,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) width = width / 3; } - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - dstScale = 1.0f / attr[0]->asymm.scale; - dstZP = attr[0]->asymm.zero_point; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } - dstZP = 0; - } + output_scale = 1.0f / attr[0]->scale; + output_zp = (float)attr[0]->zero_point; shaderParam.global_scale[0] = 16; shaderParam.global_scale[1] = 1; @@ -426,8 +413,10 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) }, GPU_DP_TYPE_16 }; switch( attr[0]->dtype ) { + case I8: case U8: case F16: + case I16: { // R status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR1st_4x4", &uniCalculateTmpR1st_4x4); @@ -461,8 +450,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoR_2x8", &uniQuantU8toU8LoR_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiR_2x8", &uniQuantU8toU8HiR_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "zp", &dstZP); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -497,8 +486,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - int32_t dstZP = 0; - float dstScale = 1; + float output_zp = 0; + float output_scale = 1; int32_t reorder = 0; int32_t order1 = 2; uint32_t width = 0; @@ -513,11 +502,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); - out_shape = attr[0]->shape; - dstZP = attr[0]->asymm.zero_point; - dstScale = attr[0]->asymm.scale; - width = (uint32_t)(out_shape->data[0]); - height = (uint32_t)(out_shape->data[1]); + out_shape = attr[0]->shape; + output_zp = (float)attr[0]->zero_point; + output_scale = 1.0f / attr[0]->scale; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); if (reorder != 0) { @@ -525,28 +514,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) order1 = 0; } - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - dstScale = (vx_float32)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[0]->dfp.fl)); - } - dstZP = 0; - } - else 
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - dstScale = 1.0f / dstScale; - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - dstScale = 1; - dstZP = 0; - } - shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; @@ -822,24 +789,20 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) switch( attr[0]->dtype ) { case U8: - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); - status |= vsi_nn_kernel_gpu_add_param(node, "zp", &dstZP); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; + case F16: case I8: case I16: { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case F16: - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", &uniConvertHalftoFp16_2x8); + if (attr[0]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; @@ -876,12 +839,14 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if (enable_copy && (output_dtype == U8 || output_dtype == F16)) + if (enable_copy && (output_dtype == I8 || output_dtype == U8 || output_dtype == F16 || output_dtype == I16)) { convert_type = COPY; + enable_copy = TRUE; } else { + enable_copy = FALSE; convert_type = SCALE; } @@ -900,7 +865,7 @@ static vsi_status _query_kernel kernel->info.parameters = vxPreProcessYuv420Kernel_param_def; kernel->info.numParams = _cnt_of_array( vxPreProcessYuv420Kernel_param_def ); - if (enable_copy && (output_dtype == U8 || output_dtype == F16)) + if (enable_copy) { kernel->info.initialize = _pre_process_yuv420_copy_initializer; } diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c new file mode 100644 index 0000000..ca76dfe --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c @@ -0,0 +1,623 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_SCALE_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_scale_U8toF16") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_SCALE_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_scale_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_scale_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_scale_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_COPY_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_copy_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_COPY_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_copy_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_copy_U8toF16") + +#define KERNEL_SOURCE_1 "pre_process_yuv422_copy", +#define KERNEL_SOURCE_2 "pre_process_yuv422_scale", + +typedef enum +{ + COPY = 0, + SCALE, + TRANS +} vsi_nn_kernel_convert_type_e; + + +// Add kernel hashtable here +#define HASH_PRE_PROCESS_YUV422_KEY(_input0_type, _output_type, _convert_type) \ + ((_input0_type << 24) | (_output_type << 16) | (_convert_type << 8)) + +#define TENSOR_PRE_PROCESS_YUV422_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \ + { HASH_PRE_PROCESS_YUV422_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE), \ + VX_KERNEL_NAME_PRE_PROCESS_YUV422_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } pre_process_yuv422_map[] = +{ + TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, I8, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, I16, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, F16, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2) +}; + +static vx_param_description_t vxPreProcessyuv422Kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, 
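For reference, HASH_PRE_PROCESS_YUV422_KEY above packs the input dtype, output dtype and convert type into one 32-bit selector, and _query_kernel later scans pre_process_yuv422_map for a matching entry. A minimal sketch of that lookup (the enum values are stand-ins, not the real vsi_nn_kernel_dtype_e codes):

    #include <stdint.h>
    #include <stddef.h>

    /* Stand-in codes; the real values come from vsi_nn_kernel_dtype_e and the
     * convert-type enum above.                                                 */
    enum { EX_U8 = 2, EX_F16 = 5, EX_COPY = 0 };

    static uint32_t yuv422_key(uint32_t in_dtype, uint32_t out_dtype, uint32_t cvt)
    {
        return (in_dtype << 24) | (out_dtype << 16) | (cvt << 8);
    }

    static int find_kernel(const uint32_t *keys, size_t count, uint32_t key)
    {
        size_t i;
        for (i = 0; i < count; i++)
        {
            if (keys[i] == key)
            {
                return (int)i;     /* index into pre_process_yuv422_map         */
            }
        }
        return -1;                 /* no entry: _query_kernel reports failure   */
    }

    /* e.g. yuv422_key(EX_U8, EX_F16, EX_COPY) selects the U8-to-F16 copy kernel. */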
VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _EVIS_PRE_PROCESS_YUV422_PARAM_NUM _cnt_of_array(vxPreProcessyuv422Kernel_param_def) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + float output_zp = 0; + float output_scale = 1; + int32_t reorder = 0; + int32_t order1 = 2; + uint32_t width = 0; + uint32_t height = 0; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f; + float outputScaleVar = 0.0f; + float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_size_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &rMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &gMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &bMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &var); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + output_scale = 1.0f / attr[0]->scale; + output_zp = (float)attr[0]->zero_point; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); + + if (reorder != 0) + { + reorder = 2; + order1 = 0; + } + + outputScaleVar = output_scale * var; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar; + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 1); + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x00003333, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x00000000, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 
0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertYUV422toB_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00120010, 0x00560054, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000, + 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertYUV422toG_4x4 = {{ + 0x29292929, // TCfg + 0x00000000, // ASelt + 0x03120310, 0x07560754, // ABin + 0x2a2a2a2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc, + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertYUV422toR_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00320030, 0x00760074, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{ + 0x91919191, // TCfg + 0x40404040, // ASelt + 0x03020100, 0x07060504, // ABin + 0xa2a2a2a2, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000700, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00010001, 0x00000001, 0x00010001, + 0x00000001, 0x00010001, 0x00000001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toB_4x4", &uniConvertYUV422toB_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toG_4x4", &uniConvertYUV422toG_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toR_4x4", &uniConvertYUV422toR_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); + status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); + status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); + switch( attr[0]->dtype ) + { + case U8: + case I8: + case I16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case F16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_yuv422_copy_initializer() */ + + 
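The copy initializer above processes four output pixels per x-thread (global_scale = {4, 1, 1}) and rounds the dispatch up with gpu_align_p2. A worked example of that sizing, assuming gpu_align_p2 rounds its first argument up to the next multiple of the (power-of-two) alignment:

    #include <stdint.h>

    /* Assumed behaviour of gpu_align_p2: round val up to a multiple of align. */
    static uint32_t align_p2(uint32_t val, uint32_t align)
    {
        return (val + align - 1) & ~(align - 1);
    }

    /* width = 299, height = 299, global_scale = {4, 1}:
     *   global_size[0] = align_p2((299 + 3) / 4, 4) = align_p2(75, 4) = 76
     *   global_size[1] = align_p2(299 / 1, 1)       = 299
     * so each x-thread covers output columns [4*x, 4*x + 3] of one row.       */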
+DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + float output_zp = 0; + float output_scale = 1; + int32_t reorder = 0; + int32_t order1 = 2; + uint32_t width = 0; + uint32_t height = 0; + uint32_t roi_width = 0; + uint32_t roi_height = 0; + uint32_t xrIntFloat_16 = 0; + uint32_t yrIntFloat_16 = 0; + int32_t xRatio = 0; + int32_t yRatio = 0; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f; + float outputScaleVar = 0.0f; + float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_size_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &xRatio); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &yRatio); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &rMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &gMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &bMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &var); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + output_scale = 1.0f / attr[0]->scale; + output_zp = (float)attr[0]->zero_point; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); + + if (reorder != 0) + { + reorder = 2; + order1 = 0; + } + + roi_width = (xRatio * width) >> 15; + roi_height = (yRatio * height) >> 15; + xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1); + yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1); + + outputScaleVar = output_scale * var; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar; + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 1); + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // 
AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertYUV422toB_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3f323c00, 0x00000000, 0x3f323c00, 0x00000000, + 0x3f323c00, 0x00000000, 0x3f323c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertYUV422toG_4x4 = {{ + 0x29292929, // TCfg + 0x14141414, // ASelt + 0x05110400, 0x07330622, // ABin + 0x2a2a2a2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc, + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertYUV422toR_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00510040, 0x00730062, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toB_4x4", &uniConvertYUV422toB_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toG_4x4", &uniConvertYUV422toG_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toR_4x4", &uniConvertYUV422toR_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16); + status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); + status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); + status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( attr[0]->dtype ) + { + case U8: + case I8: + case I16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case F16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } + +OnError: + 
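The scale-path initializer above converts the Q15 crop ratios into 16.16 fixed-point steps (xrIntFloat_16 / yrIntFloat_16), one source increment per output pixel. A small worked example of that arithmetic; how the shader consumes the step (a >> 16 per destination coordinate) is an assumption, not copied from the .vx source:

    #include <stdint.h>

    /* xRatio / yRatio arrive as Q15 ratios via the scalar parameters packed in _setup. */
    static uint32_t fixed_step_q16(int32_t ratio_q15, uint32_t dst_extent)
    {
        uint32_t roi_extent = (ratio_q15 * dst_extent) >> 15;
        return (roi_extent << 16) / dst_extent + 1;   /* 16.16 step per output pixel */
    }

    /* Example: dst width 224, xRatio = 16384 (0.5 in Q15)
     *   roi_width     = (16384 * 224) >> 15     = 112
     *   xrIntFloat_16 = (112 << 16) / 224 + 1   = 32769  (about 0.5 in 16.16)
     * so a source column would be roughly (dst_x * 32769) >> 16 plus the crop offset. */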
if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_yuv422_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params, + int32_t scale_x + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_nn_kernel_convert_type_e convert_type = SCALE; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + int i = 0; + vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (enable_copy) + { + convert_type = COPY; + } + else + { + convert_type = SCALE; + } + + key = HASH_PRE_PROCESS_YUV422_KEY( input0_dtype, output_dtype, convert_type ); + + for ( i = 0; i < _cnt_of_array(pre_process_yuv422_map); i ++ ) + { + if ( pre_process_yuv422_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(pre_process_yuv422_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_yuv422_map[i].function_name ); + kernel->info.parameters = vxPreProcessyuv422Kernel_param_def; + kernel->info.numParams = _cnt_of_array( vxPreProcessyuv422Kernel_param_def ); + + if (convert_type == COPY) + { + kernel->info.initialize = _pre_process_yuv422_copy_initializer; + } + else + { + kernel->info.initialize = _pre_process_yuv422_initializer; + } + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + pre_process_yuv422_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + pre_process_yuv422_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_YUV422_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; + int32_t trans = 0; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, params, scale_x ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 2; + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + int32_t yuv422_type = vsi_nn_kernel_param_get_int32( params, "yuv422_type" ); + + /* Pass parameters to node. 
*/ + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV422_PARAM_NUM, + inputs, 1, outputs, 1 ); + + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &yuv422_type ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_YUV422_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[2] ); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + vsi_nn_kernel_scalar_release( &tmp_params[5] ); + vsi_nn_kernel_scalar_release( &tmp_params[6] ); + vsi_nn_kernel_scalar_release( &tmp_params[7] ); + vsi_nn_kernel_scalar_release( &tmp_params[8] ); + vsi_nn_kernel_scalar_release( &tmp_params[9] ); + vsi_nn_kernel_scalar_release( &tmp_params[10] ); + vsi_nn_kernel_scalar_release( &tmp_params[11] ); + vsi_nn_kernel_scalar_release( &tmp_params[12] ); + } + } + vsi_safe_release_tensor(reshape_tensors[0]); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( pre_process_yuv422, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c index 7a3eeed..be1cd09 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c @@ -361,7 +361,7 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; - gpu_quantize_multiplier_16bit(input_scale * output_scale, &M0, &postShift); + gpu_quantize_multiplier_16bit((double)input_scale * (double)output_scale, &M0, &postShift); multAndoutZP[0] = (uint32_t)(M0); multAndoutZP[1] = (uint32_t)((outputZP << postShift) - inputZP * M0); diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index 6896307..1e79cbf 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -1202,7 +1202,7 @@ DEF_KERNEL_INITIALIZER(_bilinear_align_corners_opt_initializer) if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr))) { - is_8x_align_corners = (scale_factor[0] == scale_factor[1]) && (scale_factor[0] = 0.125f); + is_8x_align_corners = (scale_factor[0] == scale_factor[1]) && (scale_factor[0] == 0.125f); } if (is_8x_align_corners) @@ -1595,6 +1595,37 @@ OnError: return scale; } +static vsi_bool _is_image_width_lt16 + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t *input, + int32_t pad_left, + int32_t pad_right + ) +{ + vsi_nn_kernel_dtype_e in_dtype = vsi_nn_kernel_map_dtype( input->attr.dtype.vx_type ); + vsi_size_t width = input->attr.size[0]; + size_t bytes = 
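In the resize_1d_nearest hunk above, the scale product passed to gpu_quantize_multiplier_16bit is widened to double before being split into a 16-bit multiplier M0 and a post-shift; resize_nearest below gets the same fix. A hedged reference of the requantization those values and the packed multAndoutZP words implement (the helper mirrors the standard fixed-point form, not the library's internal code):

    #include <stdint.h>

    /* The combined rescale factor (input_scale * output_scale above) is
     * approximated as M0 / 2^postShift, so with
     *   multAndoutZP[0] = M0
     *   multAndoutZP[1] = (outputZP << postShift) - inputZP * M0
     * the kernel can requantize with a single multiply-add:                   */
    static int32_t requantize_ref(int32_t in_q, uint32_t M0, int32_t postShift,
                                  int32_t in_zp, int32_t out_zp)
    {
        int64_t acc = (int64_t)in_q * (int64_t)M0
                    + (((int64_t)out_zp << postShift) - (int64_t)in_zp * (int64_t)M0);
        return (int32_t)(acc >> postShift);   /* out_q ~= (in_q - in_zp) * factor + out_zp */
    }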
vsi_nn_kernel_dtype_get_bytes(in_dtype); + vsi_size_t max_cross_read_img_width = bytes == 1 ? 16 : 8; + + if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) + { + return FALSE; + } + + if (pad_left <= 0 || pad_right <= 0) + { + return FALSE; + } + + if (width + pad_left + pad_right > max_cross_read_img_width ) + { + return FALSE; + } + + return TRUE; +} + static vsi_nn_kernel_node_t _setup ( vsi_nn_graph_t * graph, @@ -1615,6 +1646,13 @@ static vsi_nn_kernel_node_t _setup vsi_bool is_evis2 = (vsi_bool)(graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_2); vsi_bool is_run_opt_kernel = FALSE; vsi_nn_tensor_t* scale = NULL; + int32_t pad_left = half_pixel_centers ? 1 : 0; + int32_t pad_right = half_pixel_centers ? 1 : 0; + + if (_is_image_width_lt16(graph, inputs[0], pad_left, pad_right)) + { + return NULL; + } status = _query_kernel( kernel, inputs, outputs, is_same_type, is_evis2, align_corners, half_pixel_centers, &is_run_opt_kernel); diff --git a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c index 1b6d094..4d01893 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c @@ -371,7 +371,7 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; - gpu_quantize_multiplier_16bit(input_scale * output_scale, &M0, &postShift); + gpu_quantize_multiplier_16bit((double)input_scale * (double)output_scale, &M0, &postShift); multAndoutZP[0] = (uint32_t)(M0); multAndoutZP[1] = (uint32_t)((outputZP << postShift) - inputZP * M0); diff --git a/src/tim/vx/internal/src/kernel/evis/select_evis.c b/src/tim/vx/internal/src/kernel/evis/select_evis.c index 897f106..fae6ad7 100644 --- a/src/tim/vx/internal/src/kernel/evis/select_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/select_evis.c @@ -34,7 +34,6 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS @@ -82,6 +81,15 @@ static const _kernel_map_type _select_kernel_map[] = PACK_KERNEL_MAP(I8, F16, I16, F16), PACK_KERNEL_MAP(I8, I16, F16, F16), PACK_KERNEL_MAP(I8, F16, F16, U8), + PACK_KERNEL_MAP(I8, U8, F16, U8), + PACK_KERNEL_MAP(I8, F16, U8, U8), + PACK_KERNEL_MAP(I8, I8, F16, I8), + PACK_KERNEL_MAP(I8, F16, I8, I8), + PACK_KERNEL_MAP(I8, I16, F16, I16), + PACK_KERNEL_MAP(I8, F16, I16, I16), + PACK_KERNEL_MAP(I8, I8, I8, F16), + PACK_KERNEL_MAP(I8, U8, U8, F16), + PACK_KERNEL_MAP(I8, I16, I16, F16), PACK_KERNEL_MAP_2D(I8, I8, I8, I8), PACK_KERNEL_MAP_2D(I8, U8, U8, U8), PACK_KERNEL_MAP_2D(I8, I16, I16, I16), @@ -93,6 +101,15 @@ static const _kernel_map_type _select_kernel_map[] = PACK_KERNEL_MAP_2D(I8, F16, I16, F16), PACK_KERNEL_MAP_2D(I8, I16, F16, F16), PACK_KERNEL_MAP_2D(I8, F16, F16, U8), + PACK_KERNEL_MAP_2D(I8, U8, F16, U8), + PACK_KERNEL_MAP_2D(I8, F16, U8, U8), + PACK_KERNEL_MAP_2D(I8, I8, F16, I8), + PACK_KERNEL_MAP_2D(I8, F16, I8, I8), + PACK_KERNEL_MAP_2D(I8, I16, F16, I16), + PACK_KERNEL_MAP_2D(I8, F16, I16, I16), + PACK_KERNEL_MAP_2D(I8, I8, I8, F16), + PACK_KERNEL_MAP_2D(I8, U8, U8, F16), + PACK_KERNEL_MAP_2D(I8, I16, I16, F16), }; /* @@ -248,16 +265,26 @@ DEF_KERNEL_INITIALIZER(_select_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); } break; - case _PACK_SELECT_KEY( I8, I8, I8 ): - case _PACK_SELECT_KEY( I16, I16, I16 ): - case _PACK_SELECT_KEY( U8, U8, U8 ): - case _PACK_SELECT_KEY( I8, F16, F16 ): - case 
_PACK_SELECT_KEY( U8, F16, F16 ): - case _PACK_SELECT_KEY( I16, F16, F16 ): - case _PACK_SELECT_KEY( F16, U8, F16 ): - case _PACK_SELECT_KEY( F16, I8, F16 ): - case _PACK_SELECT_KEY( F16, I16, F16 ): - case _PACK_SELECT_KEY( F16, F16, U8 ): + case _PACK_SELECT_KEY( I8, I8, I8 ): + case _PACK_SELECT_KEY( I16, I16, I16 ): + case _PACK_SELECT_KEY( U8, U8, U8 ): + case _PACK_SELECT_KEY( I8, F16, F16 ): + case _PACK_SELECT_KEY( U8, F16, F16 ): + case _PACK_SELECT_KEY( I16, F16, F16 ): + case _PACK_SELECT_KEY( F16, U8, F16 ): + case _PACK_SELECT_KEY( F16, I8, F16 ): + case _PACK_SELECT_KEY( F16, I16, F16 ): + case _PACK_SELECT_KEY( F16, F16, U8 ): + case _PACK_SELECT_KEY( U8, F16, U8 ): + case _PACK_SELECT_KEY( F16, U8, U8 ): + case _PACK_SELECT_KEY( I8, F16, I8 ): + case _PACK_SELECT_KEY( F16, I8, I8 ): + case _PACK_SELECT_KEY( I16, F16, I16 ): + case _PACK_SELECT_KEY( F16, I16, I16 ): + case _PACK_SELECT_KEY( I8, I8, F16 ): + case _PACK_SELECT_KEY( I16, I16, F16 ): + case _PACK_SELECT_KEY( U8, U8, F16 ): + case _PACK_SELECT_KEY( BF16, BF16, BF16 ): { uint32_t multAndoutZP0[2] = {0}; uint32_t multAndoutZP1[2] = {0}; @@ -367,9 +394,12 @@ static vsi_status _query_kernel out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); cond_dtype = (BOOL8 == cond_dtype || U8 == cond_dtype) ? I8 : cond_dtype; - in0_dtype = (BOOL8 == in0_dtype) ? I8 : in0_dtype; - in1_dtype = (BOOL8 == in1_dtype) ? I8 : in1_dtype; - out_dtype = (BOOL8 == out_dtype) ? I8 : out_dtype; + in0_dtype = (BOOL8 == in0_dtype) ? I8 : in0_dtype; + in0_dtype = (BF16 == in0_dtype) ? I16 : in0_dtype; + in1_dtype = (BOOL8 == in1_dtype) ? I8 : in1_dtype; + in1_dtype = (BF16 == in1_dtype) ? I16 : in1_dtype; + out_dtype = (BOOL8 == out_dtype) ? I8 : out_dtype; + out_dtype = (BF16 == out_dtype) ? I16 : out_dtype; key = SELECT_HASH_KEY(cond_dtype, in0_dtype, in1_dtype, out_dtype, image_2d); @@ -415,7 +445,7 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -424,10 +454,10 @@ static vsi_nn_kernel_node_t _setup image_2d = (outputs[0]->attr.dim_num == 2); status = _query_kernel( kernel, inputs, outputs, image_2d); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( node_params, _SELECT_PARAM_NUM, diff --git a/src/tim/vx/internal/src/kernel/evis/tile_evis.c b/src/tim/vx/internal/src/kernel/evis/tile_evis.c index b9e46cd..50e43cf 100644 --- a/src/tim/vx/internal/src/kernel/evis/tile_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/tile_evis.c @@ -544,6 +544,8 @@ static vsi_nn_kernel_node_t _setup vsi_size_t depthOut = new_rank > 2 ? reshape_tensors[1]->attr.size[2] : 1; vsi_size_t batchIn = new_rank > 3 ? reshape_tensors[0]->attr.size[3] : 1; + shapes[1][2] = shapes[1][2] == 0 ? 1 : shapes[1][2]; + shapes[1][3] = shapes[1][3] == 0 ? 
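The select_evis changes above map BF16 operands onto the I16 kernels before the hash key is built (and add a BF16,BF16,BF16 case); element-wise select only routes bits, so a bfloat16 payload can presumably reuse the int16 path unchanged. A tiny illustration of that reinterpretation (sketch only):

    #include <stdint.h>

    /* Select never interprets the 16-bit payload: copy whichever pattern the
     * condition picks, whether it encodes int16, fp16 or bf16 values.         */
    static void select_u16_bits(const uint8_t *cond, const uint16_t *a,
                                const uint16_t *b, uint16_t *out, int n)
    {
        int i;
        for (i = 0; i < n; i++)
        {
            out[i] = cond[i] ? a[i] : b[i];
        }
    }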
1 : shapes[1][3]; vsi_nn_kernel_node_pack_io( node_params, _EVIS_PARAM_NUM, &reshape_tensors[0], 1, &reshape_tensors[1], 1 ); diff --git a/src/tim/vx/internal/src/kernel/sp/layer_norm_y_direction_sp.c b/src/tim/vx/internal/src/kernel/sp/layer_norm_y_direction_sp.c new file mode 100644 index 0000000..79d2e3a --- /dev/null +++ b/src/tim/vx/internal/src/kernel/sp/layer_norm_y_direction_sp.c @@ -0,0 +1,797 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_sp_unit_operation.h" +#include "kernel/vsi_nn_sp_lut.h" + +#if (VX_STREAM_PROCESSOR_SUPPORT) + +vsi_nn_spinst_t * vsi_nn_sp_moments_axis1_inst + ( + vx_context context, + int32_t fifo_depth, + int32_t max_vector_depth + ) +{ + vsi_status status = VSI_FAILURE; + const int32_t spInitInstsNum = fifo_depth == 1 ? 4 : 3; + const int32_t spLoopInstsNum = fifo_depth == 2 ? 4 : 3; + const int32_t spCompleteInstsNum = fifo_depth == 1 ? 
3 : 0; + const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum + spCompleteInstsNum; + + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[11]; + vsi_nn_spinst_attr_t attr; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + + if (fifo_depth == 1) + { + /* init inst0: r3 = 0 */ + status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR3); + /* init inst1: r1 = 0 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 0, VSI_NN_SP_SR1); + /* init inst2: r4 = 0 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 0, VSI_NN_SP_SR4); + /* init inst3: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[3]); + CHECK_STATUS_FAIL_GOTO(status, final ); + + /* loop inst0: r5 = r1 * r1 || r1 = in */ + status = vsi_nn_sp_mul(&sp_insts_param[4], VSI_NN_SP_SR1, VSI_NN_SP_SR1, VSI_NN_SP_SR5); + status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SRIN, VSI_NN_SP_SR1); + /* loop inst1: r3 = r3 + r1 || out = r1 */ + status |= vsi_nn_sp_add(&sp_insts_param[5], VSI_NN_SP_SR3, VSI_NN_SP_SR1, VSI_NN_SP_SR3); + status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SROUT); + /* loop inst2: r5 = r5 + r4 */ + status |= vsi_nn_sp_add(&sp_insts_param[6], VSI_NN_SP_SR5, VSI_NN_SP_SR4, VSI_NN_SP_SR5); + CHECK_STATUS_FAIL_GOTO(status, final ); + + /* complete inst0: v11 = r3 */ + status = vsi_nn_sp_move(&sp_insts_param[7], VSI_NN_SP_SR3, VSI_NN_SP_VR11); + /* complete inst1: r3 = r3 + r1 || out = r1 */ + status |= vsi_nn_sp_nop(&sp_insts_param[8]); + /* complete inst2: v12 = r4 */ + status = vsi_nn_sp_move(&sp_insts_param[9], VSI_NN_SP_SR4, VSI_NN_SP_VR12); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.flush_cycle_num = 8; + } + else if (fifo_depth == 2) + { + /* init inst0: r3 = 0 */ + status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR3); + /* init inst1: r2 = 1 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 1, VSI_NN_SP_SR2); + /* init inst2: r4 = 0 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 0, VSI_NN_SP_SR4); + CHECK_STATUS_FAIL_GOTO(status, final ); + + /* loop inst0: out = r2 * r1 || v11 = r1 + r3 | r1 = in */ + status = vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_SR2, VSI_NN_SP_SR1, VSI_NN_SP_SROUT); + status |= vsi_nn_sp_add(&sp_insts_param[3], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_VR11); + status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SRIN, VSI_NN_SP_SR1); + /* loop inst1: v12 = r4 + r5 | r3 = v11 */ + status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR4, VSI_NN_SP_SR5, VSI_NN_SP_VR12); + status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR3); + /* loop inst2: r4 = v12 */ + status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_VR12, VSI_NN_SP_SR4); + /* loop inst3: r5 = r1 * r1 */ + status |= vsi_nn_sp_mul(&sp_insts_param[6], VSI_NN_SP_SR1, VSI_NN_SP_SR1, VSI_NN_SP_SR5); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.flush_cycle_num = 5; + + attr.ignored_leading_v11_rd = fifo_depth; + attr.ignored_leading_v12_rd = fifo_depth; + attr.ignored_leading_v11_wr = 1; + attr.ignored_leading_v12_wr = 1; + + attr.num_of_v11_rd_in_flush_cycle = 1; + attr.num_of_v12_rd_in_flush_cycle = 1; + attr.num_of_v11_wr_in_flush_cycle = 1; + attr.num_of_v12_wr_in_flush_cycle = 2; + } + else + { + /* init inst0: r3 = 0 */ + status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR3); + /* init inst1: r2 = 0 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 0, VSI_NN_SP_SR2); + /* init inst2: r4 = 0 
*/ + status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 0, VSI_NN_SP_SR4); + CHECK_STATUS_FAIL_GOTO(status, final ); + + /* loop inst0: r5 = r1 * r1 | out = r2 + r1 || r1 = in */ + status = vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_SR1, VSI_NN_SP_SR1, VSI_NN_SP_SR5); + status |= vsi_nn_sp_add(&sp_insts_param[3], VSI_NN_SP_SR2, VSI_NN_SP_SR1, VSI_NN_SP_SROUT); + status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SRIN, VSI_NN_SP_SR1); + /* loop inst1: v11 = r1 + r3 | r3 = v11 */ + status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_VR11); + status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR3); + /* loop inst2: v12 = r4 + r5 | r4 = v12 */ + status |= vsi_nn_sp_add(&sp_insts_param[5], VSI_NN_SP_SR4, VSI_NN_SP_SR5, VSI_NN_SP_VR12); + status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_VR12, VSI_NN_SP_SR4); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.ignored_leading_v11_rd = fifo_depth; + attr.ignored_leading_v12_rd = fifo_depth; + attr.ignored_leading_v11_wr = 1; + attr.ignored_leading_v12_wr = 1; + + attr.num_of_v11_rd_in_flush_cycle = 1; + attr.num_of_v12_rd_in_flush_cycle = 1; + attr.num_of_v11_wr_in_flush_cycle = 2; + attr.num_of_v12_wr_in_flush_cycle = 2; + + attr.flush_cycle_num = 5; + } + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE; + attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.prog_complete_instr_num = spCompleteInstsNum; + attr.ignored_leading_outputs = 1; + attr.v11_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; + attr.v12_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_X; + attr.split_tilex_equal_imgx = TRUE; + attr.split_max_vector_depth = max_vector_depth; + + spinst = vsi_nn_create_spinst_by_context(context); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + return spinst; +} + +DEF_SP_KERNEL_QUERY(moements_axis1_query) + ( + vsi_nn_kernel_node_t node + ) +{ + vsi_status status = VSI_FAILURE; + vx_size index = 0; + vx_size tile_size[2] = {0}; + vsi_nn_spinst_t *spinst = NULL; + int32_t fifo_depth = 0; + int32_t max_vector_depth = 0; + vx_context ctx = vxGetContext((vx_reference)node); + vx_hardware_caps_params_ext2_t hw_param; + + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t)); + status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size)); + CHECK_STATUS_FAIL_GOTO( status, final ); + status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + fifo_depth = (int32_t)ceil((float)tile_size[0] / (float)hw_param.streamProcessorExecCount); + max_vector_depth = hw_param.streamProcessorVectorSize; + + spinst = vsi_nn_sp_moments_axis1_inst(ctx, fifo_depth, max_vector_depth); + + status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return status; +} + +vsi_nn_kernel_node_t vsi_nn_sp_moments_axis1_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * 
input, + vsi_nn_tensor_t * output0, + vsi_nn_tensor_t * output1 + ) +{ + const uint32_t input_count = 1; + const uint32_t output_count = 2; + vx_tensor inputs_tensor[1] = {NULL}; + vx_tensor outputs_tensor[2] = {NULL}; + vx_node node = NULL; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; + int32_t fifo_depth = 4; + + vsi_nn_spinst_t *spinst = NULL; + + spinst = vsi_nn_sp_moments_axis1_inst(graph->ctx->c, fifo_depth, max_vector_depth); + + inputs_tensor[0] = input->t; + outputs_tensor[0] = output0->t; + outputs_tensor[1] = output1->t; + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + NULL); + + if (node) + { + vxAssignNodeQueryCallback(node, moements_axis1_query); + } + + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return (vsi_nn_kernel_node_t)node; +} + +vsi_nn_kernel_node_t vsi_nn_sp_ln_means_axis1_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output, + float inv_m, + float const_a, + float s, + float eps, + float output_scale + ) +{ + const int32_t spInitInstsNum = 2; + const int32_t spLoopInstsNum = 5; + const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; + + const uint32_t input_count = 1; + const uint32_t output_count = 1; + vx_tensor inputs_tensor[1] = {NULL}; + vx_tensor outputs_tensor[1] = {NULL}; + vx_node node = NULL; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; + + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[7]; + vsi_nn_spinst_attr_t attr; + vsi_nn_sp_lut_params sp_lut_params; + vx_lut_params_s vx_lut_params; + + vsi_status status = VSI_FAILURE; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + memset(&sp_lut_params, 0, sizeof(vsi_nn_sp_lut_params)); + memset(&vx_lut_params, 0, sizeof(vx_lut_params_s)); + + /* init inst0: r2 = const_a */ + status = vsi_nn_sp_move_constant(&sp_insts_param[0], const_a, VSI_NN_SP_SR2); + /* init inst1: r3 = inv_m */ + status = vsi_nn_sp_move_constant(&sp_insts_param[1], inv_m, VSI_NN_SP_SR3); + /* loop inst0: r4 = v11 * v11 || r6 = r4 + r5 || r5 = v11*/ + status = vsi_nn_sp_mul(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_VR11, VSI_NN_SP_SR4); + status |= vsi_nn_sp_add(&sp_insts_param[2], VSI_NN_SP_SR4, VSI_NN_SP_SR5, VSI_NN_SP_SR6); + status |= vsi_nn_sp_move(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_SR5); + /* loop inst1: r1 = pwlMul() || r7 = pwlAdd() */ + status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR1); + status |= vsi_nn_sp_sub(&sp_insts_param[3], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR7); + /* loop inst2: r5 = r2 * v12 || v12 = r8 + r7 */ + status |= vsi_nn_sp_mul(&sp_insts_param[4], VSI_NN_SP_SR2, VSI_NN_SP_VR12, VSI_NN_SP_SR5); + status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR8, VSI_NN_SP_SR7, VSI_NN_SP_VR12); + /* loop inst3: r1 = setup(r6) || v11 = r3 * r5 || r7 = r1 */ + status |= vsi_nn_sp_pwl_setup0(&sp_insts_param[5], VSI_NN_SP_SR6, VSI_NN_SP_SR1); + status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_SR3, VSI_NN_SP_SR5, VSI_NN_SP_VR11); + status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR7); + /* loop inst3: r8 = r1 * r7 */ + status |= vsi_nn_sp_mul(&sp_insts_param[6], VSI_NN_SP_SR1, VSI_NN_SP_SR7, VSI_NN_SP_SR8); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE; + + attr.input_setup = VSI_NN_SP_INPUT_SETUP_V11; + 
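vsi_nn_sp_moments_axis1_inst above accumulates the running sum in v11 and the running sum of squares in v12, and vsi_nn_sp_ln_means_axis1_node combines them with inv_m and (further down) a VSI_NN_SP_ACT_LINEAR_RSQRT lookup table. A plain-C reference of the per-row statistics these stream-processor programs are assumed to implement for layer normalization (names and the exact epsilon placement are illustrative):

    #include <math.h>
    #include <stddef.h>

    /* One normalization row of length m: keep sum (v11) and sum of squares (v12),
     * then derive mean and 1/sqrt(var + eps) with inv_m = 1.0f / m.             */
    static void ln_row_stats(const float *x, size_t m, float eps,
                             float *mean_out, float *inv_std_out)
    {
        size_t i;
        float sum = 0.0f, sum_sq = 0.0f;
        float inv_m, mean, var;

        for (i = 0; i < m; i++)
        {
            sum    += x[i];          /* v11 accumulation */
            sum_sq += x[i] * x[i];   /* v12 accumulation */
        }

        inv_m = 1.0f / (float)m;
        mean  = sum * inv_m;
        var   = sum_sq * inv_m - mean * mean;   /* E[x^2] - E[x]^2 */
        *mean_out    = mean;
        *inv_std_out = 1.0f / sqrtf(var + eps);
    }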
attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.ignored_leading_outputs = 0; + attr.ignored_leading_v11_wr = 0; + attr.ignored_leading_v12_wr = 3; + attr.ignored_leading_v11_rd = 0; + attr.flush_cycle_num = 17; + + attr.num_of_v11_rd_in_flush_cycle = 0; + attr.num_of_v12_rd_in_flush_cycle = 1; + attr.num_of_v11_wr_in_flush_cycle = 1; + attr.num_of_v12_wr_in_flush_cycle = 4; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_X; + attr.split_tilex_equal_imgx = TRUE; + attr.split_max_vector_depth = max_vector_depth; + + spinst = vsi_nn_create_spinst(graph); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + + inputs_tensor[0] = input->t; + outputs_tensor[0] = output->t; + + vx_lut_params.lut_function = VX_NN_ACTIVATION_CUSTOM; + vx_lut_params.in_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); + vx_lut_params.out_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); + + sp_lut_params.act_type = VSI_NN_SP_ACT_LINEAR_RSQRT; + sp_lut_params.params[0] = s; + sp_lut_params.params[1] = eps; + sp_lut_params.params[2] = output_scale; + vsi_nn_sp_lut(vx_lut_params.in_lut, vx_lut_params.out_lut, &sp_lut_params); + + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + &vx_lut_params); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + if (vx_lut_params.in_lut) + { + vxReleaseLUT(&vx_lut_params.in_lut); + vx_lut_params.in_lut = NULL; + } + if (vx_lut_params.out_lut) + { + vxReleaseLUT(&vx_lut_params.out_lut); + vx_lut_params.out_lut = NULL; + } + + return (vsi_nn_kernel_node_t)node; +} + +vsi_nn_spinst_t * vsi_nn_sp_layer_norm_axis1_inst + ( + vx_context context, + int32_t fifo_depth, + int32_t max_vector_depth + ) +{ + vsi_status status = VSI_FAILURE; + const int32_t spInitInstsNum = 0; + const int32_t spLoopInstsNum = fifo_depth > 3 ? 
2 : 5; + const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; + + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[5]; + vsi_nn_spinst_attr_t attr; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + + if (fifo_depth > 3) + { + /* loop inst0: out = in - v11 || v11 = v11 */ + status = vsi_nn_sp_sub(&sp_insts_param[0], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR1); + status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR11, VSI_NN_SP_VR11); + /* loop inst1: out = r1 * v12 | v12 = v12 */ + status |= vsi_nn_sp_mul(&sp_insts_param[1], VSI_NN_SP_SR1, VSI_NN_SP_VR12, VSI_NN_SP_SROUT); + status |= vsi_nn_sp_move(&sp_insts_param[1], VSI_NN_SP_VR12, VSI_NN_SP_VR12); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.flush_cycle_num = 3; + attr.ignored_leading_v12_rd = 1; + attr.ignored_leading_v12_wr = 1; + + attr.num_of_v11_rd_in_flush_cycle = 0; + attr.num_of_v12_rd_in_flush_cycle = 2; + attr.num_of_v11_wr_in_flush_cycle = 0; + attr.num_of_v12_wr_in_flush_cycle = 2; + } + else + { + /* loop inst0: out = in - v11 || v11 = v11 */ + status = vsi_nn_sp_sub(&sp_insts_param[0], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR1); + status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR11, VSI_NN_SP_VR11); + /* loop inst1: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[1]); + /* loop inst2: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[2]); + /* loop inst3: out = r1 * v12 | v12 = v12 */ + status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_SR1, VSI_NN_SP_VR12, VSI_NN_SP_SROUT); + status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_VR12, VSI_NN_SP_VR12); + /* loop inst4: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[4]); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.flush_cycle_num = 4; + attr.ignored_leading_v12_rd = 0; + attr.ignored_leading_v12_wr = 0; + + attr.num_of_v11_rd_in_flush_cycle = 0; + attr.num_of_v12_rd_in_flush_cycle = 1; + attr.num_of_v11_wr_in_flush_cycle = 0; + attr.num_of_v12_wr_in_flush_cycle = 1; + } + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE; + attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.ignored_leading_outputs = 0; + attr.ignored_leading_v11_rd = 0; + attr.ignored_leading_v11_wr = 0; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_X; + attr.split_tilex_equal_imgx = TRUE; + attr.split_max_vector_depth = max_vector_depth; + + spinst = vsi_nn_create_spinst_by_context(context); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + return spinst; +} + +DEF_SP_KERNEL_QUERY(layer_norm_axis1_query) + ( + vsi_nn_kernel_node_t node + ) +{ + vsi_status status = VSI_FAILURE; + vx_size index = 0; + vx_size tile_size[2] = {0}; + vsi_nn_spinst_t *spinst = NULL; + int32_t fifo_depth = 0; + int32_t max_vector_depth = 0; + vx_context ctx = vxGetContext((vx_reference)node); + vx_hardware_caps_params_ext2_t hw_param; + + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t)); + status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size)); + CHECK_STATUS_FAIL_GOTO( status, final ); + 
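+    /* Re-derive the FIFO depth from the tile size the driver actually chose
+     * and rebuild the SP program with the matching loop variant; e.g. a
+     * 57-wide tile on hardware with 16 SP execution lanes gives
+     * ceil(57 / 16) = 4, which is > 3 and selects the two-instruction loop.
+     */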
status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + fifo_depth = (int32_t)ceil((float)tile_size[0] / (float)hw_param.streamProcessorExecCount); + max_vector_depth = hw_param.streamProcessorVectorSize; + + spinst = vsi_nn_sp_layer_norm_axis1_inst(ctx, fifo_depth, max_vector_depth); + + status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return status; +} + +vsi_nn_kernel_node_t vsi_nn_sp_layer_norm_axis1_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input0, + vsi_nn_tensor_t * input1, + vsi_nn_tensor_t * output + ) +{ + const uint32_t input_count = 2; + const uint32_t output_count = 1; + vx_tensor inputs_tensor[2] = {NULL}; + vx_tensor outputs_tensor[1] = {NULL}; + vx_node node = NULL; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; + int32_t fifo_depth = 4; + vsi_nn_spinst_t *spinst = NULL; + + spinst = vsi_nn_sp_layer_norm_axis1_inst(graph->ctx->c, fifo_depth, max_vector_depth); + + inputs_tensor[0] = input0->t; + inputs_tensor[1] = input1->t; + outputs_tensor[0] = output->t; + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + NULL); + + if (node) + { + vxAssignNodeQueryCallback(node, layer_norm_axis1_query); + } + + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return (vsi_nn_kernel_node_t)node; +} + +vsi_nn_kernel_node_t vsi_nn_sp_load_weight_bias_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * weight, + vsi_nn_tensor_t * bias, + vsi_nn_tensor_t * dummy_output + ) +{ + const int32_t spLoopInstsNum = 2; + const int32_t spInstsNum = spLoopInstsNum; + + const uint32_t input_count = 2; + const uint32_t output_count = 1; + vx_tensor inputs_tensor[2] = {NULL}; + vx_tensor outputs_tensor[2] = {NULL}; + vx_node node = NULL; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth / + graph->ctx->config.sp_exec_count; + + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[2]; + vsi_nn_spinst_attr_t attr; + + vsi_status status = VSI_FAILURE; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + + /* loop inst0: v11 = in*/ + status = vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_SRIN, VSI_NN_SP_VR11); + /* loop inst0: v12 = in*/ + status |= vsi_nn_sp_move(&sp_insts_param[1], VSI_NN_SP_SRIN, VSI_NN_SP_VR12); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE; + attr.input_setup = VSI_NN_SP_INPUT_SETUP_INTERLEAVE_TWO_INPUT; + + attr.prog_loop_instr_num = spLoopInstsNum; + attr.ignored_leading_outputs = 0; + attr.flush_cycle_num = 0; + attr.ignored_leading_v11_rd = 0; + attr.ignored_leading_v11_wr = 0; + attr.ignored_leading_v12_rd = 0; + attr.ignored_leading_v12_wr = 0; + attr.v11_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; + attr.v12_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; + attr.ch0_post_redistribute = VSI_NN_SP_CH_POST_REDISTRIBUTE_VECTOR_GATHER; + attr.ch1_post_redistribute = VSI_NN_SP_CH_POST_REDISTRIBUTE_VECTOR_GATHER; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_YZ; + attr.split_max_vector_depth = max_vector_depth; + + spinst = vsi_nn_create_spinst(graph); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status 
|= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + + inputs_tensor[0] = weight->t; + inputs_tensor[1] = bias->t; + outputs_tensor[0] = dummy_output->t; + + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + NULL); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return (vsi_nn_kernel_node_t)node; +} + +vsi_nn_kernel_node_t vsi_nn_sp_in_times_v11_plus_v12_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * dummy_tensor, + vsi_nn_tensor_t * output + ) +{ + const int32_t spLoopInstsNum = 1; + const int32_t spInstsNum = spLoopInstsNum; + + const uint32_t input_count = 2; + const uint32_t output_count = 1; + vx_tensor inputs_tensor[3] = {NULL}; + vx_tensor outputs_tensor[1] = {NULL}; + vx_node node = NULL; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth / + graph->ctx->config.sp_exec_count; + + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[1]; + vsi_nn_spinst_attr_t attr; + + vsi_status status = VSI_FAILURE; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + + /* loop inst0: r1 = in * v11 || out = r1 + v12 */ + status = vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR1); + status |= vsi_nn_sp_add(&sp_insts_param[0], VSI_NN_SP_SR1, VSI_NN_SP_VR12, VSI_NN_SP_SROUT); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE; + + attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.ignored_leading_outputs = 3; + attr.ignored_leading_v11_rd = 0; + attr.ignored_leading_v12_rd = 3; + attr.flush_cycle_num = 3; + attr.v11_push_pop_config = VSI_NN_SP_PUSH_POP_EVERY_ROW; + attr.v12_push_pop_config = VSI_NN_SP_PUSH_POP_EVERY_ROW; + + attr.num_of_v11_rd_in_flush_cycle = 0; + attr.num_of_v12_rd_in_flush_cycle = 3; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_YZ; + attr.split_max_vector_depth = max_vector_depth; + + spinst = vsi_nn_create_spinst(graph); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + + inputs_tensor[0] = input->t; + inputs_tensor[1] = dummy_tensor->t; + outputs_tensor[0] = output->t; + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + NULL); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return (vsi_nn_kernel_node_t)node; +} + +/* +** This program requires sum operation in the Y dimension. +** Instead of using the SUM Engine, the sum needs to be performed +** by Stream Processor instructions. 
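+** The graph built below chains five SP nodes through dummy tensors: a
+** moments node that accumulates the running sum and sum of squares into
+** the v11/v12 FIFOs, a means node that converts them via the
+** LINEAR_RSQRT LUT into the mean and a scaled 1 / sqrt(variance + eps)
+** term, a normalization node computing (x - mean) * inv_std, a preload
+** node that interleaves the scale and bias tensors into v11/v12, and a
+** final x * v11 + v12 node that writes the result.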
+*/ +vsi_nn_kernel_node_t layer_norm_y_direction + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + const vsi_nn_kernel_param_t * params + ) +{ + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t * dummy_tensor[3] = {NULL}; + vsi_nn_tensor_t * output_tensor[2] = {NULL}; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + float inv_m = 1.0f / (float)(outputs[0]->attr.size[0]); + float s = inv_m * inv_m; + float const_a = (float)(outputs[0]->attr.size[0]); + + memcpy( &attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.is_dummy = TRUE; + attr.size[axis] = 1; + dummy_tensor[0] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( dummy_tensor[0], "Create dummy_tensor fail.", final ); + dummy_tensor[1] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( dummy_tensor[1], "Create dummy_tensor fail.", final ); + memcpy( &attr.size, &inputs[2]->attr.size, sizeof(inputs[2]->attr.size) ); + attr.dim_num = inputs[2]->attr.dim_num; + dummy_tensor[2] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( dummy_tensor[2], "Create dummy_tensor fail.", final ); + + memcpy( &attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + output_tensor[0] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( output_tensor[0], "Create tensor fail.", final ); + output_tensor[1] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( output_tensor[1], "Create tensor fail.", final ); + + node = vsi_nn_sp_moments_axis1_node(graph, inputs[0], output_tensor[0], dummy_tensor[0]); + CHECK_PTR_FAIL_GOTO( node, "Create sp_moments_axis1 fail.", final ); + node = vsi_nn_sp_ln_means_axis1_node(graph, dummy_tensor[0], dummy_tensor[1], + inv_m, const_a, s, eps, output_scale); + CHECK_PTR_FAIL_GOTO( node, "Create ln_y_dirction_means fail.", final ); + node = vsi_nn_sp_layer_norm_axis1_node(graph, output_tensor[0], dummy_tensor[1], output_tensor[1]); + CHECK_PTR_FAIL_GOTO( node, "Create layer_norm_axis1 fail.", final ); + + node = vsi_nn_sp_load_weight_bias_node(graph, inputs[2], inputs[1], dummy_tensor[2]); + CHECK_PTR_FAIL_GOTO( node, "Create mov_weight_bias fail.", final ); + node = vsi_nn_sp_in_times_v11_plus_v12_node(graph, output_tensor[1], dummy_tensor[2], outputs[0]); + CHECK_PTR_FAIL_GOTO( node, "Create in_times_v11_plus_v12 fail.", final ); + +final: + vsi_safe_release_tensor(dummy_tensor[0]); + vsi_safe_release_tensor(dummy_tensor[1]); + vsi_safe_release_tensor(dummy_tensor[2]); + vsi_safe_release_tensor(output_tensor[0]); + vsi_safe_release_tensor(output_tensor[1]); + + return node; +} /* layer_norm_y_direction() */ + + +#endif diff --git a/src/tim/vx/internal/src/kernel/sp/softmax_z_direction_sp.c b/src/tim/vx/internal/src/kernel/sp/softmax_z_direction_sp.c new file mode 100644 index 0000000..cb550c2 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/sp/softmax_z_direction_sp.c @@ -0,0 +1,938 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person 
obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_sp_unit_operation.h" +#include "kernel/vsi_nn_sp_lut.h" + +#if (VX_STREAM_PROCESSOR_SUPPORT) + +vsi_nn_spinst_t * vsi_nn_sp_max_axis2_inst + ( + vx_context context, + int32_t fifo_depth, + int32_t max_vector_depth + ) +{ + vsi_status status = VSI_FAILURE; + const int32_t spInitInstsNum = 4; + const int32_t spLoopInstsNum = fifo_depth > 4 ? 3 : 11; + const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; + uint32_t f32_min = 0xff800000; + float clampMin = *(float*)&f32_min; + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[15]; + vsi_nn_spinst_attr_t attr; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + + /* init inst0: r2 = -INF */ + status = vsi_nn_sp_move_constant(&sp_insts_param[0], clampMin, VSI_NN_SP_SR2); + /* init inst1: r10 = 0 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 0, VSI_NN_SP_SR10); + /* init inst2: r4 = 1 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 1, VSI_NN_SP_SR4); + /* init inst3: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[3]); + CHECK_STATUS_FAIL_GOTO(status, final); + + if (fifo_depth > 4) + { + /* loop inst0: r1 = clamp(r3 * in, r6, r7) | r2 = v11 + r10 | r9 = r2 */ + status = vsi_nn_sp_mul_clamp(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SRIN, VSI_NN_SP_SR1); + status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR10, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SR2, VSI_NN_SP_SR9); + /* loop inst1: r8 = r1 * r4 | r5 = r1 - r2 | v11 = r5 ? 
r8 : r9 */ + status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_SR8); + status |= vsi_nn_sp_sub(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR2, VSI_NN_SP_SR5); + status |= vsi_nn_sp_move_sel0(&sp_insts_param[5], VSI_NN_SP_SR5, VSI_NN_SP_SR8, VSI_NN_SP_VR11); + /* loop inst2: out = r1 */ + status |= vsi_nn_sp_move(&sp_insts_param[6], VSI_NN_SP_SR1, VSI_NN_SP_SROUT); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.flush_cycle_num = 7; + + attr.ignored_leading_outputs = 1; + attr.ignored_leading_v11_rd = fifo_depth; + attr.ignored_leading_v11_wr = 2; + + attr.num_of_v11_rd_in_flush_cycle = 0; + attr.num_of_v11_wr_in_flush_cycle = 3; + } + else + { + /* loop inst0: r1 = clamp(r3 * in, r6, r7) | r2 = v11 + r10 | r9 = r2 */ + status = vsi_nn_sp_mul_clamp(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SRIN, VSI_NN_SP_SR1); + status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR10, VSI_NN_SP_SR2); + /* loop inst1: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[5]); + /* loop inst2: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[6]); + /* loop inst3: r8 = r1 * r4 | r5 = r1 - r2 | v11 = r5 ? r8 : r9 */ + status |= vsi_nn_sp_mul(&sp_insts_param[7], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_SR8); + status |= vsi_nn_sp_sub(&sp_insts_param[7], VSI_NN_SP_SR1, VSI_NN_SP_SR2, VSI_NN_SP_SR5); + status |= vsi_nn_sp_move(&sp_insts_param[7], VSI_NN_SP_SR2, VSI_NN_SP_SR9); + /* loop inst4: out = r1 */ + status |= vsi_nn_sp_move(&sp_insts_param[8], VSI_NN_SP_SR1, VSI_NN_SP_SROUT); + /* loop inst5: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[9]); + /* loop inst6: nop */ + status |= vsi_nn_sp_move_sel0(&sp_insts_param[10], VSI_NN_SP_SR5, VSI_NN_SP_SR8, VSI_NN_SP_VR11); + /* loop inst7: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[11]); + /* loop inst8: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[12]); + /* loop inst9: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[13]); + /* loop inst10: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[14]); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.ignored_leading_outputs = 0; + attr.ignored_leading_v11_rd = fifo_depth; + attr.ignored_leading_v11_wr = 0; + + attr.num_of_v11_rd_in_flush_cycle = 0; + attr.num_of_v11_wr_in_flush_cycle = 1; + + attr.flush_cycle_num = 10; + } + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; + attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.v11_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY; + attr.split_tilex_equal_imgx = TRUE; + attr.split_max_vector_depth = max_vector_depth; + + spinst = vsi_nn_create_spinst_by_context(context); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + return spinst; +} + +DEF_SP_KERNEL_QUERY(max_axis2_query) + ( + vsi_nn_kernel_node_t node + ) +{ + vsi_status status = VSI_FAILURE; + vx_size index = 0; + vx_size tile_size[2] = {0}; + vsi_nn_spinst_t *spinst = NULL; + int32_t fifo_depth = 0; + int32_t max_vector_depth = 0; + vx_context ctx = vxGetContext((vx_reference)node); + vx_hardware_caps_params_ext2_t hw_param; + + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t)); + status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), 
sizeof(vx_hardware_caps_params_ext2_t)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size)); + CHECK_STATUS_FAIL_GOTO( status, final ); + status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + fifo_depth = (int32_t)ceil((float)(tile_size[0] * tile_size[1]) / (float)hw_param.streamProcessorExecCount); + max_vector_depth = hw_param.streamProcessorVectorSize; + + spinst = vsi_nn_sp_max_axis2_inst(ctx, fifo_depth, max_vector_depth); + + status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return status; +} + +vsi_nn_kernel_node_t vsi_nn_sp_max_axis2_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output0, + vsi_nn_tensor_t * output1 + ) +{ + const int32_t spInitInstsNum = 4; + const int32_t spLoopInstsNum = 3; + const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; + + const uint32_t input_count = 1; + const uint32_t output_count = 2; + vx_tensor inputs_tensor[1] = {NULL}; + vx_tensor outputs_tensor[2] = {NULL}; + vx_node node = NULL; + + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[7]; + vsi_nn_spinst_attr_t attr; + + vsi_status status = VSI_FAILURE; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; + uint32_t f32_min = 0xff800000; + float flt_min = *(float*)&f32_min; + float input_scale = vsi_nn_get_tensor_scale(input); + float clamp_min = 0; + float clamp_max = 0; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + + vsi_nn_get_tensor_clamp_min_max(input, &clamp_min, &clamp_max); + clamp_min = clamp_min * input_scale; + clamp_max = clamp_max * input_scale; + + /* init inst0: r2 = -INF */ + status = vsi_nn_sp_move_constant(&sp_insts_param[0], flt_min, VSI_NN_SP_SR2); + /* init inst1: r10 = 0 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 0, VSI_NN_SP_SR10); + /* init inst2: r4 = 1 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 1, VSI_NN_SP_SR4); + /* init inst3: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[3]); + CHECK_STATUS_FAIL_GOTO(status, final); + + /* loop inst0: r1 = clamp(r3 * in, r6, r7) | r2 = v11 + r10 | r9 = r2 */ + status = vsi_nn_sp_mul_clamp(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SRIN, VSI_NN_SP_SR1); + status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR10, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SR2, VSI_NN_SP_SR9); + /* loop inst1: r8 = r1 * r4 | r5 = r1 - r2 | v11 = r5 ? 
r8 : r9 */ + status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_SR8); + status |= vsi_nn_sp_sub(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR2, VSI_NN_SP_SR5); + status |= vsi_nn_sp_move_sel0(&sp_insts_param[5], VSI_NN_SP_SR5, VSI_NN_SP_SR8, VSI_NN_SP_VR11); + /* loop inst2: out = r1 */ + status |= vsi_nn_sp_move(&sp_insts_param[6], VSI_NN_SP_SR1, VSI_NN_SP_SROUT); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; + attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.v11_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY; + attr.split_tilex_equal_imgx = TRUE; + attr.split_max_vector_depth = max_vector_depth; + + attr.flush_cycle_num = 7; + + attr.ignored_leading_outputs = 1; + attr.ignored_leading_v11_rd = 5; + attr.ignored_leading_v11_wr = 2; + + attr.num_of_v11_rd_in_flush_cycle = 0; + attr.num_of_v11_wr_in_flush_cycle = 3; + + VSI_NN_SP_ATTR_SET_CONST_TO_SR3(attr, input_scale); + VSI_NN_SP_ATTR_SET_CONST_TO_SR6(attr, clamp_max); + VSI_NN_SP_ATTR_SET_CONST_TO_SR7(attr, clamp_min); + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + + spinst = vsi_nn_create_spinst(graph); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + + inputs_tensor[0] = input->t; + outputs_tensor[0] = output0->t; + outputs_tensor[1] = output1->t; + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + NULL); + +final: + + if (node) + { + vxAssignNodeQueryCallback(node, max_axis2_query); + } + + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return (vsi_nn_kernel_node_t)node; +} + +vsi_nn_spinst_t * vsi_nn_sp_exp_y_direction_inst + ( + vx_context context, + int32_t fifo_depth, + int32_t max_vector_depth + ) +{ + vsi_status status = VSI_FAILURE; + const int32_t spInitInstsNum = 2; + const int32_t spLoopInstsNum = fifo_depth > 3 ? 
4 : 8; + const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[10]; + vsi_nn_spinst_attr_t attr; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + + /* init inst0: r8 = 0 */ + status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR8); + /* init inst1: r9 = 1 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 1, VSI_NN_SP_SR9); + CHECK_STATUS_FAIL_GOTO(status, final); + + if (fifo_depth > 3) + { + /* loop inst0: r2 = in - v11 | v11 = v11 */ + status = vsi_nn_sp_sub(&sp_insts_param[2], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_VR11); + /* loop inst1: r8 = v12 * r9 | r7 = r4 + r6 | out = r7 */ + status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_VR12, VSI_NN_SP_SR9, VSI_NN_SP_SR8); + status |= vsi_nn_sp_add(&sp_insts_param[3], VSI_NN_SP_SR4, VSI_NN_SP_SR6, VSI_NN_SP_SR7); + status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SR7, VSI_NN_SP_SROUT); + /* loop inst2: r6 = r5 * r2 | v12 = r7 + r8 | r4 = r3 */ + status |= vsi_nn_sp_mul(&sp_insts_param[4], VSI_NN_SP_SR5, VSI_NN_SP_SR2, VSI_NN_SP_SR6); + status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR7, VSI_NN_SP_SR8, VSI_NN_SP_VR12); + status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SR4); + /* loop inst3: r1 = setup(r2) | r5 = pwlMul * pwlMul | r2 = pwlAdd + pwlAdd | r1 = r3*/ + status |= vsi_nn_sp_pwl_setup0(&sp_insts_param[5], VSI_NN_SP_SR2, VSI_NN_SP_SR1); + status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR5); + status |= vsi_nn_sp_sub(&sp_insts_param[5], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR3); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.flush_cycle_num = 18; + + attr.ignored_leading_outputs = 4; + attr.ignored_leading_v11_rd = 0; + attr.ignored_leading_v11_wr = 0; + attr.ignored_leading_v12_rd = fifo_depth + 3; + attr.ignored_leading_v12_wr = 4; + + attr.num_of_v12_rd_in_flush_cycle = 4; + attr.num_of_v12_wr_in_flush_cycle = 5; + } + else + { + /* loop inst0: r2 = in - v11 | v11 = v11 */ + status = vsi_nn_sp_sub(&sp_insts_param[2], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_VR11); + /* loop inst1: r6 = r5 * r2 | r4 = r3 */ + status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_SR5, VSI_NN_SP_SR2, VSI_NN_SP_SR6); + status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SR3, VSI_NN_SP_SR4); + /* loop inst2: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[4]); + /* loop inst3: r1 = setup(r2) */ + status = vsi_nn_sp_pwl_setup0(&sp_insts_param[5], VSI_NN_SP_SR2, VSI_NN_SP_SR1); + /* loop inst4: r8 = v12 * r9 | r7 = r4 + r6 */ + status |= vsi_nn_sp_mul(&sp_insts_param[6], VSI_NN_SP_VR12, VSI_NN_SP_SR9, VSI_NN_SP_SR8); + status |= vsi_nn_sp_add(&sp_insts_param[6], VSI_NN_SP_SR4, VSI_NN_SP_SR6, VSI_NN_SP_SR7); + /* loop inst5: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[7]); + /* loop inst6: r5 = pwlMul * pwlMul | r2 = pwlAdd + pwlAdd | r1 = r3*/ + status |= vsi_nn_sp_mul(&sp_insts_param[8], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR5); + status |= vsi_nn_sp_sub(&sp_insts_param[8], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[8], VSI_NN_SP_SR1, VSI_NN_SP_SR3); + /* loop inst7: v12 = r7 + r8 | out = 
r7 */ + status |= vsi_nn_sp_add(&sp_insts_param[9], VSI_NN_SP_SR7, VSI_NN_SP_SR8, VSI_NN_SP_VR12); + status |= vsi_nn_sp_move(&sp_insts_param[9], VSI_NN_SP_SR7, VSI_NN_SP_SROUT); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.ignored_leading_outputs = 1; + attr.ignored_leading_v11_rd = 0; + attr.ignored_leading_v11_wr = 0; + attr.ignored_leading_v12_rd = fifo_depth + 1; + attr.ignored_leading_v12_wr = 1; + + attr.num_of_v12_rd_in_flush_cycle = 2; + attr.num_of_v12_wr_in_flush_cycle = 2; + + attr.flush_cycle_num = 15; + } + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; + attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.v12_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY; + attr.split_tilex_equal_imgx = TRUE; + attr.split_max_vector_depth = max_vector_depth; + + spinst = vsi_nn_create_spinst_by_context(context); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + return spinst; +} + +DEF_SP_KERNEL_QUERY(softmax_z_direction_exp_query) + ( + vsi_nn_kernel_node_t node + ) +{ + vsi_status status = VSI_FAILURE; + vx_size index = 0; + vx_size tile_size[2] = {0}; + vsi_nn_spinst_t *spinst = NULL; + int32_t fifo_depth = 0; + int32_t max_vector_depth = 0; + vx_context ctx = vxGetContext((vx_reference)node); + vx_hardware_caps_params_ext2_t hw_param; + + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t)); + status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size)); + CHECK_STATUS_FAIL_GOTO( status, final ); + status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + fifo_depth = (int32_t)ceil((float)(tile_size[0] * tile_size[1])/ (float)hw_param.streamProcessorExecCount); + max_vector_depth = hw_param.streamProcessorVectorSize; + + spinst = vsi_nn_sp_exp_y_direction_inst(ctx, fifo_depth, max_vector_depth); + + status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return status; +} + +vsi_nn_kernel_node_t vsi_nn_sp_softmax_z_direction_exp_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input0, + vsi_nn_tensor_t * input1, + vsi_nn_tensor_t * output0, + vsi_nn_tensor_t * output1, + float beta + ) +{ + const int32_t spInitInstsNum = 2; + const int32_t spLoopInstsNum = 4; + const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; + + const uint32_t input_count = 2; + const uint32_t output_count = 2; + vx_tensor inputs_tensor[2] = {NULL}; + vx_tensor outputs_tensor[2] = {NULL}; + vx_node node = NULL; + + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[6]; + vsi_nn_spinst_attr_t attr; + + vsi_nn_sp_lut_params sp_lut_params; + vx_lut_params_s vx_lut_params; + + vsi_status status = VSI_FAILURE; + int32_t fifo_depth = 4; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + 
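+    /* This node streams in the already scaled and clamped input minus the
+     * per-position maximum held in v11, evaluates exp((x - max) * beta)
+     * through the LINEAR_EXP LUT configured below, writes the exponentials
+     * to the output and accumulates their running sum into v12 for the
+     * downstream reciprocal node.
+     */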
memset(&sp_lut_params, 0, sizeof(vsi_nn_sp_lut_params)); + memset(&vx_lut_params, 0, sizeof(vx_lut_params_s)); + + /* init inst0: r8 = 0 */ + status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR8); + /* init inst1: r9 = 1 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 1, VSI_NN_SP_SR9); + CHECK_STATUS_FAIL_GOTO(status, final); + + /* loop inst0: r2 = in - v11 | v11 = v11 */ + status = vsi_nn_sp_sub(&sp_insts_param[2], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_VR11); + /* loop inst1: r8 = v12 * r9 | r7 = r4 + r6 | out = r7 */ + status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_VR12, VSI_NN_SP_SR9, VSI_NN_SP_SR8); + status |= vsi_nn_sp_add(&sp_insts_param[3], VSI_NN_SP_SR4, VSI_NN_SP_SR6, VSI_NN_SP_SR7); + status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SR7, VSI_NN_SP_SROUT); + /* loop inst2: r6 = r5 * r2 | v12 = r7 + r8 | r4 = r3 */ + status |= vsi_nn_sp_mul(&sp_insts_param[4], VSI_NN_SP_SR5, VSI_NN_SP_SR2, VSI_NN_SP_SR6); + status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR7, VSI_NN_SP_SR8, VSI_NN_SP_VR12); + status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SR4); + /* loop inst3: r1 = setup(r2) | r5 = pwlMul * pwlMul | r2 = pwlAdd + pwlAdd | r1 = r3*/ + status |= vsi_nn_sp_pwl_setup0(&sp_insts_param[5], VSI_NN_SP_SR2, VSI_NN_SP_SR1); + status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR5); + status |= vsi_nn_sp_sub(&sp_insts_param[5], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR3); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.flush_cycle_num = 18; + + attr.ignored_leading_outputs = 4; + attr.ignored_leading_v11_rd = 0; + attr.ignored_leading_v11_wr = 0; + attr.ignored_leading_v12_rd = fifo_depth + 3; + attr.ignored_leading_v12_wr = 4; + + attr.num_of_v12_rd_in_flush_cycle = 4; + attr.num_of_v12_wr_in_flush_cycle = 5; + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; + attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.v12_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY; + attr.split_tilex_equal_imgx = TRUE; + attr.split_max_vector_depth = max_vector_depth; + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + + spinst = vsi_nn_create_spinst(graph); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + + vx_lut_params.lut_function = VX_NN_ACTIVATION_CUSTOM; + vx_lut_params.in_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); + vx_lut_params.out_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); + + sp_lut_params.act_type = VSI_NN_SP_ACT_LINEAR_EXP; + sp_lut_params.params[0] = beta; + sp_lut_params.params[1] = 0; + vsi_nn_sp_lut(vx_lut_params.in_lut, vx_lut_params.out_lut, &sp_lut_params); + + inputs_tensor[0] = input0->t; + inputs_tensor[1] = input1->t; + outputs_tensor[0] = output0->t; + outputs_tensor[1] = output1->t; + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + &vx_lut_params); + +final: + if (node) + 
{ + vxAssignNodeQueryCallback(node, softmax_z_direction_exp_query); + } + + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + if (vx_lut_params.in_lut) + { + vxReleaseLUT(&vx_lut_params.in_lut); + vx_lut_params.in_lut = NULL; + } + + if (vx_lut_params.out_lut) + { + vxReleaseLUT(&vx_lut_params.out_lut); + vx_lut_params.out_lut = NULL; + } + + return (vsi_nn_kernel_node_t)node; +} +vsi_nn_kernel_node_t vsi_nn_sp_rcp_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output, + float output_scale + ) +{ + const int32_t spLoopInstsNum = 3; + const int32_t spInstsNum = spLoopInstsNum; + + const uint32_t input_count = 1; + const uint32_t output_count = 1; + vx_tensor inputs_tensor[1] = {NULL}; + vx_tensor outputs_tensor[1] = {NULL}; + vx_node node = NULL; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; + + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[3]; + vsi_nn_spinst_attr_t attr; + + vsi_nn_sp_lut_params sp_lut_params; + vx_lut_params_s vx_lut_params; + + vsi_status status = VSI_FAILURE; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + memset(&sp_lut_params, 0, sizeof(vsi_nn_sp_lut_params)); + memset(&vx_lut_params, 0, sizeof(vx_lut_params_s)); + + /* loop inst0: r1 = pwlSetup(v12) | r5 = pwlMul() | r2 = pwlAdd() | r8 = r1 */ + status = vsi_nn_sp_pwl_setup0(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_SR1); + status |= vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR5); + status |= vsi_nn_sp_sub(&sp_insts_param[0], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_SR1, VSI_NN_SP_SR8); + /* loop inst1: r6 = r5 * r2 | r7 = r4 + r6 | r4 = r8 */ + status |= vsi_nn_sp_mul(&sp_insts_param[1], VSI_NN_SP_SR5, VSI_NN_SP_SR2, VSI_NN_SP_SR6); + status |= vsi_nn_sp_add(&sp_insts_param[1], VSI_NN_SP_SR4, VSI_NN_SP_SR6, VSI_NN_SP_SR7); + status |= vsi_nn_sp_move(&sp_insts_param[1], VSI_NN_SP_SR4, VSI_NN_SP_SR8); + /* loop inst1: v12 = r7 * r3 */ + status |= vsi_nn_sp_mul(&sp_insts_param[2], VSI_NN_SP_SR7, VSI_NN_SP_SR3, VSI_NN_SP_VR12); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; + + attr.input_setup = VSI_NN_SP_INPUT_SETUP_V12; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.ignored_leading_v12_wr = 4; + attr.ignored_leading_v12_rd = 0; + attr.flush_cycle_num = 14; + + attr.num_of_v12_wr_in_flush_cycle = 5; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_YZ; + attr.split_max_vector_depth = max_vector_depth; + + VSI_NN_SP_ATTR_SET_CONST_TO_SR3(attr, 1.0f / output_scale); + + spinst = vsi_nn_create_spinst(graph); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + + inputs_tensor[0] = input->t; + outputs_tensor[0] = output->t; + + vx_lut_params.lut_function = VX_NN_ACTIVATION_CUSTOM; + vx_lut_params.in_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); + vx_lut_params.out_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); + + sp_lut_params.act_type = VSI_NN_SP_ACT_RCP; + vsi_nn_sp_lut(vx_lut_params.in_lut, vx_lut_params.out_lut, &sp_lut_params); + + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + 
&vx_lut_params); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + if (vx_lut_params.in_lut) + { + vxReleaseLUT(&vx_lut_params.in_lut); + vx_lut_params.in_lut = NULL; + } + if (vx_lut_params.out_lut) + { + vxReleaseLUT(&vx_lut_params.out_lut); + vx_lut_params.out_lut = NULL; + } + + return (vsi_nn_kernel_node_t)node; +} + +vsi_nn_spinst_t * vsi_nn_sp_times_inst + ( + vx_context context, + int32_t fifo_depth, + int32_t max_vector_depth + ) +{ + vsi_status status = VSI_FAILURE; + const int32_t spInitInstsNum = 0; + const int32_t spLoopInstsNum = fifo_depth > 4 ? 1 : fifo_depth > 1 ? 3 : 5; + const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[5]; + vsi_nn_spinst_attr_t attr; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + + if (fifo_depth > 4) + { + /* loop inst0: out = v12 * in | v12 = v12 */ + status = vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_SRIN, VSI_NN_SP_SROUT); + status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_VR12); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (fifo_depth > 1) + { + /* loop inst0: out = v12 * in | v12 = v12 */ + status = vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_SRIN, VSI_NN_SP_SROUT); + status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_VR12); + /* loop inst1: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[1]); + /* loop inst2: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[2]); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + /* loop inst0: out = v12 * in | v12 = v12 */ + status = vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_SRIN, VSI_NN_SP_SROUT); + status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_VR12); + /* loop inst1: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[1]); + /* loop inst2: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[2]); + /* loop inst3: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[3]); + /* loop inst4: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[4]); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; + attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + + attr.flush_cycle_num = 0; + + attr.ignored_leading_outputs = 0; + attr.ignored_leading_v11_rd = 0; + attr.ignored_leading_v11_wr = 0; + + attr.num_of_v11_rd_in_flush_cycle = 0; + attr.num_of_v11_wr_in_flush_cycle = 0; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY; + attr.split_tilex_equal_imgx = TRUE; + attr.split_max_vector_depth = max_vector_depth; + + spinst = vsi_nn_create_spinst_by_context(context); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + return spinst; +} + +DEF_SP_KERNEL_QUERY(times_query) + ( + vsi_nn_kernel_node_t node + ) +{ + vsi_status status = VSI_FAILURE; + vx_size index = 0; + vx_size tile_size[2] = {0}; + vsi_nn_spinst_t *spinst = NULL; + int32_t fifo_depth = 0; + int32_t max_vector_depth = 0; + vx_context ctx = vxGetContext((vx_reference)node); + vx_hardware_caps_params_ext2_t hw_param; + + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t)); + status = vxQueryHardwareCaps(ctx, 
(vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size)); + CHECK_STATUS_FAIL_GOTO( status, final ); + status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + fifo_depth = (int32_t)ceil((float)(tile_size[0] * tile_size[1]) / (float)hw_param.streamProcessorExecCount); + max_vector_depth = hw_param.streamProcessorVectorSize; + + spinst = vsi_nn_sp_times_inst(ctx, fifo_depth, max_vector_depth); + + status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return status; +} + +vsi_nn_kernel_node_t vsi_nn_sp_softmax_z_direction_times_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input0, + vsi_nn_tensor_t * input1, + vsi_nn_tensor_t * output + ) +{ + const uint32_t input_count = 2; + const uint32_t output_count = 1; + vx_tensor inputs_tensor[2] = {NULL, NULL}; + vx_tensor outputs_tensor[1] = {NULL}; + vx_node node = NULL; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; + int32_t fifo_depth = 5; + + vsi_nn_spinst_t *spinst = NULL; + + spinst = vsi_nn_sp_times_inst(graph->ctx->c, fifo_depth, max_vector_depth); + + inputs_tensor[0] = input0->t; + inputs_tensor[1] = input1->t; + outputs_tensor[0] = output->t; + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + NULL); + + if (node) + { + vxAssignNodeQueryCallback(node, times_query); + } + + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return (vsi_nn_kernel_node_t)node; +} + +/* +** This program requires sum operation in the z dimension. +** Instead of using the SUM Engine, the sum needs to be performed +** by Stream Processor instructions. 
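+** The graph built below chains four SP nodes through dummy tensors: a
+** max node that tracks the running maximum along z in v11 while passing
+** the scaled, clamped input through, an exp node that evaluates
+** exp((x - max) * beta) via the LINEAR_EXP LUT and accumulates the sum
+** of exponentials into v12, a reciprocal node that folds the output
+** quantization scale into 1 / sum via the RCP LUT, and a multiply node
+** that scales each exponential by that reciprocal.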
+*/ +vsi_nn_kernel_node_t softmax_z_direction + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + const vsi_nn_kernel_param_t * params + ) +{ + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t * dummy_tensor[3] = {NULL}; + vsi_nn_tensor_t * output_tensor[2] = {NULL}; + int32_t axis = 2; + float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); + float output_scale = vsi_nn_get_tensor_scale(outputs[0]); + + memcpy( &attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.is_dummy = TRUE; + attr.size[axis] = 1; + dummy_tensor[0] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( dummy_tensor[0], "Create dummy_tensor fail.", final ); + dummy_tensor[1] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( dummy_tensor[1], "Create dummy_tensor fail.", final ); + dummy_tensor[2] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( dummy_tensor[2], "Create dummy_tensor fail.", final ); + + memcpy( &attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + output_tensor[0] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( output_tensor[0], "Create tensor fail.", final ); + output_tensor[1] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( output_tensor[1], "Create tensor fail.", final ); + + node = vsi_nn_sp_max_axis2_node(graph, inputs[0], output_tensor[0], dummy_tensor[0]); + CHECK_PTR_FAIL_GOTO( node, "Create sp_max_axis2 fail.", final ); + node = vsi_nn_sp_softmax_z_direction_exp_node(graph, output_tensor[0], dummy_tensor[0], + output_tensor[1], dummy_tensor[1], beta); + CHECK_PTR_FAIL_GOTO( node, "Create exp_y_direction fail.", final ); + node = vsi_nn_sp_rcp_node(graph, dummy_tensor[1], dummy_tensor[2], output_scale); + CHECK_PTR_FAIL_GOTO( node, "Create sp_rcp fail.", final ); + node = vsi_nn_sp_softmax_z_direction_times_node(graph, output_tensor[1], dummy_tensor[2], outputs[0]); + CHECK_PTR_FAIL_GOTO( node, "Create softmax_times fail.", final ); + +final: + vsi_safe_release_tensor(dummy_tensor[0]); + vsi_safe_release_tensor(dummy_tensor[1]); + vsi_safe_release_tensor(dummy_tensor[2]); + vsi_safe_release_tensor(output_tensor[0]); + vsi_safe_release_tensor(output_tensor[1]); + + return node; +} /* softmax_z_direction() */ + +#endif diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index e9a9272..aa47362 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -35,6 +35,7 @@ #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_math.h" #include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_dtype_util.h" #include "libnnext/vsi_nn_libnnext_resource.h" #if VSI_USE_VXC_BINARY @@ -118,7 +119,14 @@ static void _kernel_clear_source static vsi_bool _check_shader_support(vsi_nn_graph_t* graph); -static vsi_bool vsi_nn_kernel_is_asymmtric_int8 +static vsi_bool _check_stream_process_support + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t** inputs, + size_t input_num + ); + +vsi_bool vsi_nn_kernel_is_supported_types ( vsi_nn_tensor_t** inputs, size_t input_num, @@ -1222,7 +1230,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector /* Skip evis and cl when disable shader */ if ( (type == 
VSI_NN_KERNEL_TYPE_EVIS || type == VSI_NN_KERNEL_TYPE_CL) && ( _check_shader_support(graph) == FALSE || - vsi_nn_kernel_is_asymmtric_int8(inputs, input_num, outputs, output_num) ) ) + vsi_nn_kernel_is_supported_types(inputs, input_num, outputs, output_num) == FALSE ) ) { continue; } @@ -1234,8 +1242,8 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector } /* Skip StreamProcesor if not support */ - if( type == VSI_NN_KERNEL_TYPE_SP - && !graph->ctx->config.support_stream_processor ) + if( type == VSI_NN_KERNEL_TYPE_SP && + _check_stream_process_support(graph, inputs, input_num) == FALSE ) { continue; } @@ -1661,7 +1669,7 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph) return FALSE; } -static vsi_bool vsi_nn_kernel_is_asymmtric_int8 +vsi_bool vsi_nn_kernel_is_supported_types ( vsi_nn_tensor_t** inputs, size_t input_num, @@ -1673,25 +1681,45 @@ static vsi_bool vsi_nn_kernel_is_asymmtric_int8 for (i = 0; i < input_num; i++) { - if ( inputs[i] && - inputs[i]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && - inputs[i]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC - ) + if ( inputs[i] && vsi_nn_TypeGetBits(inputs[i]->attr.dtype.vx_type) == 4 ) { - return TRUE; + return FALSE; } } for (i = 0; i < output_num; i++) { - if ( outputs[i] && - outputs[i]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && - outputs[i]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC - ) + if ( outputs[i] && vsi_nn_TypeGetBits(outputs[i]->attr.dtype.vx_type) == 4 ) { - return TRUE; + return FALSE; } } - return FALSE; + return TRUE; +} + +static vsi_bool _check_stream_process_support + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t** inputs, + size_t input_num + ) +{ + if ( graph->ctx->config.support_stream_processor == 0 ) + { + return FALSE; + } + + if ( graph->ctx->config.sp_exec_count == 0 ) + { + return FALSE; + } + + if (inputs && input_num > 0 && + inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + { + return FALSE; + } + + return TRUE; } \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c index 105027d..d78769e 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c @@ -653,4 +653,61 @@ vsi_bool vsi_nn_kernel_optimize_group_norm_shape } return status; -} \ No newline at end of file +} + +vsi_bool vsi_nn_kernel_optimize_scatter_elements_shape + ( + const vsi_size_t* shape_x, const vsi_size_t rank_x, const int32_t axis, + vsi_size_t* out_shape_x, uint32_t* out_rank_x, int32_t* out_axis, vsi_size_t max_size + ) +{ + vsi_bool ret = TRUE; + vsi_size_t i = 0; + vsi_size_t rank_in = 0; + vsi_size_t dims = 0; + vsi_size_t innerSize = 1; + vsi_size_t outerSize = 1; + vsi_size_t axisSize = shape_x[axis]; + + for (i = 0; i < (size_t)axis; i++) + { + innerSize *= shape_x[i]; + } + + for (i = axis + 1; i < rank_x; i++) + { + outerSize *= shape_x[i]; + } + + rank_in += element_fill_dim(out_shape_x, rank_in, max_size, innerSize); + dims = element_fill_dim(out_shape_x, rank_in, max_size, axisSize); + if (dims == 0) + { + *out_axis = (int32_t)rank_in; + out_shape_x[rank_in ++] = 1; + } + else + { + *out_axis = (int32_t)rank_in; + } + + rank_in += dims; + + rank_in += element_fill_dim(out_shape_x, rank_in, max_size, outerSize); + + if ( 0 == rank_in ) + { + out_shape_x[0] = 1; + out_shape_x[1] = 1; + rank_in = 2; + } + else if ( 1 == rank_in ) + { + out_shape_x[1] = 1; + rank_in = 2; + } + + 
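+    /* Assuming every collapsed size fits within max_size, the shape reduces
+     * to { innerSize, axisSize, outerSize } with out_axis pointing at the
+     * middle dimension, e.g. shape { 2, 3, 4, 5 } with axis = 2 becomes
+     * { 6, 4, 5 } with out_axis = 1.
+     */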
*out_rank_x = (uint32_t)rank_in; + + return ret; +} /* vsi_nn_kernel_optimize_scatter_elements_shape() */ diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c index dd32c01..dfdc3dd 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c @@ -199,6 +199,31 @@ static float softsign_eval(float x) return x / (1 + vsi_abs(x)); } +static float linear_exp_eval(float x, vsi_nn_kernel_lut_params *lut_param) +{ + float a = lut_param->params[0]; + float b = lut_param->params[1]; + + return expf(x * a + b); +} + +static float linear_rsqrt_eval(float x, vsi_nn_kernel_lut_params *lut_param) +{ + float a = lut_param->params[0]; + float b = lut_param->params[1]; + float scale = lut_param->params[2]; + + return scale / sqrtf(a * x + b); +} + +static float linear_sigmoid_eval(float x, vsi_nn_kernel_lut_params *lut_param) +{ + float a = lut_param->params[0]; + float b = lut_param->params[1]; + + return 1.0f / (1 + expf(a * x + b));; +} + static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *lut_param) { float result = 0; @@ -261,6 +286,15 @@ static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params * case VSI_NN_KERNEL_LUT_SOFTSIGN: result = softsign_eval(data); break; + case VSI_NN_KERNEL_LUT_LINEAR_EXP: + result = linear_exp_eval(data, lut_param); + break; + case VSI_NN_KERNEL_LUT_LINEAR_RSQRT: + result = linear_rsqrt_eval(data, lut_param); + break; + case VSI_NN_KERNEL_LUT_LINEAR_SIGMOID: + result = linear_sigmoid_eval(data, lut_param); + break; default: VSILOGE( "unsupported activation function:%d", lut_param->act_type ); break; diff --git a/src/tim/vx/internal/src/kernel/vx/convolutional.c b/src/tim/vx/internal/src/kernel/vx/convolutional.c index 89c8fa4..2f9be49 100644 --- a/src/tim/vx/internal/src/kernel/vx/convolutional.c +++ b/src/tim/vx/internal/src/kernel/vx/convolutional.c @@ -43,7 +43,8 @@ static vsi_bool _build_vx_conv2d_param int32_t dilation_h, int32_t dilation_w, int32_t multiplier, vsi_enum overflow_policy, vsi_enum rounding_policy, - vsi_enum down_scale_size_rounding + vsi_enum down_scale_size_rounding, + vsi_enum pad_mode ) { vx_nn_convolution_params_ext_t * p1 = NULL; @@ -78,6 +79,7 @@ static vsi_bool _build_vx_conv2d_param p1->khr.down_scale_size_rounding = (vx_enum)down_scale_size_rounding; p1->padding_x_right = (uint32_t)pad_w_end; p1->padding_y_bottom = (uint32_t)pad_h_end; + p1->pad_mode = (vx_enum)pad_mode; param->depth_multiplier = multiplier; param->stride_x = (uint32_t)stride_w; param->stride_y = (uint32_t)stride_h; @@ -131,7 +133,8 @@ static vsi_bool _build_vx_conv3d_param int32_t dilation_d, int32_t dilation_h, int32_t dilation_w, int32_t multiplier, vsi_enum overflow_policy, vsi_enum rounding_policy, - vsi_enum down_scale_size_rounding + vsi_enum down_scale_size_rounding, + vsi_enum pad_mode ) { VSI_ASSERT( stride_d > 0 ); @@ -176,6 +179,7 @@ static vsi_bool _build_vx_conv3d_param param->stride_w = (uint32_t)stride_w; param->stride_h = (uint32_t)stride_h; param->stride_d = (uint32_t)stride_d; + param->pad_mode = (vx_enum)pad_mode; return TRUE; } /* _build_vx_conv2d_param() */ @@ -299,7 +303,8 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) 0, vsi_nn_kernel_param_get_int32(params, "overflow_policy"), vsi_nn_kernel_param_get_int32(params, "rounding_policy"), - vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding"), + 
vsi_nn_kernel_param_get_int32(params, "pad_mode") ); temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, @@ -374,7 +379,8 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) vsi_nn_kernel_param_get_int32(params, "multiplier"), vsi_nn_kernel_param_get_int32(params, "overflow_policy"), vsi_nn_kernel_param_get_int32(params, "rounding_policy"), - vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding"), + vsi_nn_kernel_param_get_int32(params, "pad_mode") ); temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, @@ -493,7 +499,8 @@ REGISTER_CONV_OPENVX_KERNEL( conv2d ) 0, vsi_nn_kernel_param_get_int32(params, "overflow_policy"), vsi_nn_kernel_param_get_int32(params, "rounding_policy"), - vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding"), + vsi_nn_kernel_param_get_int32(params, "pad_mode") ); node = vxConvolutionLayer( graph->g, @@ -524,7 +531,8 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv2d ) vsi_nn_kernel_param_get_int32(params, "multiplier"), vsi_nn_kernel_param_get_int32(params, "overflow_policy"), vsi_nn_kernel_param_get_int32(params, "rounding_policy"), - vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding"), + vsi_nn_kernel_param_get_int32(params, "pad_mode") ); node = vxConvolutionLayer( graph->g, @@ -606,7 +614,8 @@ REGISTER_CONV_OPENVX_KERNEL( conv3d ) vsi_nn_kernel_param_get_int32(params, "depth_multiplier"), vsi_nn_kernel_param_get_int32(params, "overflow_policy"), vsi_nn_kernel_param_get_int32(params, "rounding_policy"), - vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding"), + vsi_nn_kernel_param_get_int32(params, "pad_mode") ); node = vxConv3dLayer( graph->g, diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c index c6edaaa..fffb3aa 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -269,4 +269,84 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( tanh ) return (vsi_nn_kernel_node_t)node; } /* tanh() */ +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( relu1 ) +{ + vx_node node = NULL; + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU1, + 0, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* relu1() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( relu6 ) +{ + vx_node node = NULL; + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU6, + 0, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* relu6() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( rsqrt ) +{ + vx_node node = NULL; + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RSQRT, + 0, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* rsqrt() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( sqrt ) +{ + vx_node node = NULL; + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SQRT, + 0, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* sqrt() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( softrelu ) +{ + vx_node node = NULL; + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SOFTRELU, + 0, + 0, + 
outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* softrelu() */ + #undef REGISTER_ELTWISE_UNARY_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/pad2_vx.c b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c index d67751b..a458e38 100644 --- a/src/tim/vx/internal/src/kernel/vx/pad2_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c @@ -65,6 +65,7 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 ) int32_t pad_front_array[VSI_NN_MAX_DIM_NUM] = {0}; int32_t pad_back_array[VSI_NN_MAX_DIM_NUM] = {0}; vsi_nn_tensor_t *convert_tensor = NULL; + vsi_bool release_intermediate_tensor = TRUE; float const_val = vsi_nn_kernel_param_get_float32(params, "const_val"); memset(¶m, 0, sizeof(param)); @@ -98,14 +99,18 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 ) } else { - convert_tensor = vsi_nn_reshape_tensor( graph, - inputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num ); + convert_tensor = inputs[0]; + release_intermediate_tensor = FALSE; } node = vxTensorPadNode( graph->g, convert_tensor->t, outputs[0]->t, ¶m, sizeof(param) ); vxReleaseScalar( ¶m.pad_const ); - vsi_safe_release_tensor(convert_tensor); + + if (release_intermediate_tensor) + { + vsi_safe_release_tensor(convert_tensor); + } return (vsi_nn_kernel_node_t)node; } /* pad2() */ diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/bucketize.cl b/src/tim/vx/internal/src/libnnext/ops/cl/bucketize.cl new file mode 100644 index 0000000..4b7ab04 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/bucketize.cl @@ -0,0 +1,281 @@ +#pragma OPENCL EXTENSION CL_VIV_asm : enable + +#define BUCKETIZE_F32_2D_SH_IMPL(name, comp_op) \ +__kernel void bucketize_##name \ + ( \ + __read_only image2d_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_t output, \ + int boundaries_size, \ + float input0_scale, \ + float input0_tail, \ + float input1_scale, \ + float input1_tail \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + float4 src0 = read_imagef(input, coord); \ + \ + int2 pos = 0; \ + do \ + { \ + float4 src1 = read_imagef(boundaries, pos); \ + if ((src0.x) comp_op (src1.x)) \ + { \ + break; \ + } \ + pos.x ++; \ + } while(pos.x < boundaries_size); \ + \ + write_imagei(output, coord, pos.xxxx); \ +} +BUCKETIZE_F32_2D_SH_IMPL(F32_F32toI32_2D, <=) +BUCKETIZE_F32_2D_SH_IMPL(right_F32_F32toI32_2D, <) + +#define BUCKETIZE_F32_SH_IMPL(name, comp_op) \ +__kernel void bucketize_##name \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_array_t output, \ + int boundaries_size, \ + float input0_scale, \ + float input0_tail, \ + float input1_scale, \ + float input1_tail \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + float4 src0 = read_imagef(input, coord); \ + \ + int2 pos = 0; \ + do \ + { \ + float4 src1 = read_imagef(boundaries, pos); \ + if ((src0.x) comp_op (src1.x)) \ + { \ + break; \ + } \ + pos.x ++; \ + } while(pos.x < boundaries_size); \ + \ + write_imagei(output, coord, pos.xxxx); \ +} +BUCKETIZE_F32_SH_IMPL(F32_F32toI32, <=) +BUCKETIZE_F32_SH_IMPL(right_F32_F32toI32, <) + +#define BUCKETIZE_I32_2D_SH_IMPL(name, comp_op) \ +__kernel void bucketize_##name \ + ( \ + __read_only image2d_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_t output, \ + int boundaries_size, \ + float input0_scale, \ + float input0_tail, \ + float input1_scale, \ + float input1_tail \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + float4 src0 = 
convert_float4(read_imagei(input, coord)); \ + \ + int2 pos = 0; \ + src0 = src0 * input0_scale + input0_tail; \ + do \ + { \ + float4 src1 = convert_float4(read_imagei(boundaries, pos)); \ + src1 = src1 * input1_scale + input1_tail; \ + if ((src0.x) comp_op (src1.x)) \ + { \ + break; \ + } \ + pos.x ++; \ + } while(pos.x < boundaries_size); \ + \ + write_imagei(output, coord, pos.xxxx); \ +} +BUCKETIZE_I32_2D_SH_IMPL(I32_I32toI32_2D, <=) +BUCKETIZE_I32_2D_SH_IMPL(right_I32_I32toI32_2D, <) + +#define BUCKETIZE_I32_SH_IMPL(name, comp_op) \ +__kernel void bucketize_##name \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_array_t output, \ + int boundaries_size, \ + float input0_scale, \ + float input0_tail, \ + float input1_scale, \ + float input1_tail \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + int4 data = read_imagei(input, coord); \ + float4 src0 = convert_float4(data) * input0_scale + input0_tail; \ + \ + int2 pos = 0; \ + do \ + { \ + float4 src1 = convert_float4(read_imagei(boundaries, pos)); \ + src1 = src1 * input1_scale + input1_tail; \ + if ((src0.x) comp_op (src1.x)) \ + { \ + break; \ + } \ + pos.x ++; \ + } while(pos.x < boundaries_size); \ + \ + write_imagei(output, coord, pos.xxxx); \ +} +BUCKETIZE_I32_SH_IMPL(I32_I32toI32, <=) +BUCKETIZE_I32_SH_IMPL(right_I32_I32toI32, <) + +#define BUCKETIZE_U32_2D_SH_IMPL(name, comp_op) \ +__kernel void bucketize_##name \ + ( \ + __read_only image2d_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_t output, \ + int boundaries_size, \ + float input0_scale, \ + float input0_tail, \ + float input1_scale, \ + float input1_tail \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + float4 src0 = convert_float4(read_imageui(input, coord)); \ + \ + int2 pos = 0; \ + src0 = src0 * input0_scale + input0_tail; \ + do \ + { \ + float4 src1 = convert_float4(read_imageui(boundaries, pos)); \ + src1 = src1 * input1_scale + input1_tail; \ + if ((src0.x) comp_op (src1.x)) \ + { \ + break; \ + } \ + pos.x ++; \ + } while(pos.x < boundaries_size); \ + \ + write_imagei(output, coord, pos.xxxx); \ +} +BUCKETIZE_U32_2D_SH_IMPL(U32_U32toI32_2D, <=) +BUCKETIZE_U32_2D_SH_IMPL(right_U32_U32toI32_2D, <) + +#define BUCKETIZE_U32_SH_IMPL(name, comp_op) \ +__kernel void bucketize_##name \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_array_t output, \ + int boundaries_size, \ + float input0_scale, \ + float input0_tail, \ + float input1_scale, \ + float input1_tail \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + uint4 data = read_imageui(input, coord); \ + float4 src0 = convert_float4(data) * input0_scale + input0_tail; \ + \ + int2 pos = 0; \ + do \ + { \ + float4 src1 = convert_float4(read_imageui(boundaries, pos)); \ + src1 = src1 * input1_scale + input1_tail; \ + if ((src0.x) comp_op (src1.x)) \ + { \ + break; \ + } \ + pos.x ++; \ + } while(pos.x < boundaries_size); \ + \ + write_imagei(output, coord, pos.xxxx); \ +} +BUCKETIZE_U32_SH_IMPL(U32_U32toI32, <=) +BUCKETIZE_U32_SH_IMPL(right_U32_U32toI32, <) + +#define BUCKETIZE_BF16_2D_SH_IMPL(name, comp_op) \ +__kernel void bucketize_##name \ + ( \ + __read_only image2d_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_t output, \ + int boundaries_size, \ + float input0_scale, \ + float input0_tail, \ + float input1_scale, \ 
+ float input1_tail \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + uint4 data0 = read_imageui(input, coord) << 16; \ + float4 src0; \ + _viv_asm(COPY, src0, data0, 16); \ + \ + int2 pos = 0; \ + do \ + { \ + uint4 data1 = read_imageui(boundaries, pos) << 16; \ + float4 src1; \ + _viv_asm(COPY, src1, data1, 16); \ + if ((src0.x) comp_op (src1.x)) \ + { \ + break; \ + } \ + pos.x ++; \ + } while(pos.x < boundaries_size); \ + \ + write_imagei(output, coord, pos.xxxx); \ +} +BUCKETIZE_BF16_2D_SH_IMPL(BF16_BF16toI32_2D, <=) +BUCKETIZE_BF16_2D_SH_IMPL(right_BF16_BF16toI32_2D, <) + +#define BUCKETIZE_BF16_SH_IMPL(name, comp_op) \ +__kernel void bucketize_##name \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_array_t output, \ + int boundaries_size, \ + float input0_scale, \ + float input0_tail, \ + float input1_scale, \ + float input1_tail \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + uint4 data0 = read_imageui(input, coord) << 16; \ + float4 src0; \ + _viv_asm(COPY, src0, data0, 16); \ + \ + int2 pos = 0; \ + do \ + { \ + uint4 data1 = read_imageui(boundaries, pos) << 16; \ + float4 src1; \ + _viv_asm(COPY, src1, data1, 16); \ + if ((src0.x) comp_op (src1.x)) \ + { \ + break; \ + } \ + pos.x ++; \ + } while(pos.x < boundaries_size); \ + \ + write_imagei(output, coord, pos.xxxx); \ +} +BUCKETIZE_BF16_SH_IMPL(BF16_BF16toI32, <=) +BUCKETIZE_BF16_SH_IMPL(right_BF16_BF16toI32, <) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lppool.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lppool.cl new file mode 100644 index 0000000..64068c2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lppool.cl @@ -0,0 +1,115 @@ + +#define LPPOOL_PROCESS(src_type, dst_type, readimage_type, conv_mode, writeimage_type) \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int hstart = gidy * stride_y - pad_top; \ + int wstart = gidx * stride_x - pad_left; \ + int hend = min(hstart + ksize_y, height); \ + int wend = min(wstart + ksize_x, width); \ + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0); \ + int4 coord_in = coord_out; \ + int h, w; \ + float sum_of_pow = 0; \ + dst_type out_data = (dst_type)(0); \ + src_type in_data; \ + float in_f32, out_f32; \ + hstart = max(hstart, 0); \ + wstart = max(wstart, 0); \ + for (h = hstart; h < hend; h++) \ + { \ + for (w = wstart; w < wend; w++) \ + { \ + coord_in.xy = (int2)(w, h); \ + in_data = readimage_type(input, coord_in).x; \ + in_f32 = convert_float(in_data) * inputScale + inputTail; \ + sum_of_pow += pow(fabs(in_f32),p); \ + } \ + } \ + out_f32 = pow(sum_of_pow, 1.0f / p) * outputScale + outputTail; \ + out_data.x = conv_mode(out_f32); \ + writeimage_type(output, coord_out, out_data); \ + +#define TENSOR_LPPOOL(src_name, dst_name, src_type, dst_type, readimage_type, conv_mode, writeimage_type) \ +__kernel void lppool_##src_name##to##dst_name ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int ksize_x, \ + int ksize_y, \ + int stride_x, \ + int stride_y, \ + int pad_left, \ + int pad_top, \ + int p, \ + int width, \ + int height, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputTail) \ +{ \ + LPPOOL_PROCESS(src_type, dst_type, readimage_type, conv_mode, writeimage_type); \ +} + +TENSOR_LPPOOL(F32, F32, float, float4, read_imagef, convert_float, write_imagef) +TENSOR_LPPOOL(F32, U32, float, uint4, read_imagef, 
convert_uint, write_imageui) +TENSOR_LPPOOL(F32, I32, float, int4, read_imagef, convert_int, write_imagei) + +TENSOR_LPPOOL(U32, U32, uint, uint4, read_imageui, convert_uint, write_imageui) +TENSOR_LPPOOL(U32, F32, uint, float4, read_imageui, convert_float, write_imagef) +TENSOR_LPPOOL(U32, I32, uint, int4, read_imageui, convert_int, write_imagei) + +TENSOR_LPPOOL(I32, I32, int, int4, read_imagei, convert_int, write_imagei) +TENSOR_LPPOOL(I32, F32, int, float4, read_imagei, convert_float, write_imagef) +TENSOR_LPPOOL(I32, U32, int, uint4, read_imagei, convert_uint, write_imageui) + +__kernel void lppool_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int ksize_x, + int ksize_y, + int stride_x, + int stride_y, + int pad_left, + int pad_top, + int p, + int width, + int height, + float inputScale, + float inputTail, + float outputScale, + float outputTail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = min(wstart + ksize_x, width); + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0); + int4 coord_in = coord_out; + int h, w; + float sum_of_pow = 0; + float out_data_f32 = 0; + uint4 dst = (uint4)(0); + float4 data_f32 = (float4)(0); + uint4 data; + hstart = max(hstart, 0); + wstart = max(wstart, 0); + + for (h = hstart; h < hend; h++) + { + for (w = wstart; w < wend; w++) + { + coord_in.xy = (int2)(w, h); + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, data_f32, data, 16); + sum_of_pow += pow(fabs(data_f32.x),p); + } + } + out_data_f32 = pow(sum_of_pow, 1.0f / p); + _viv_asm(COPY, dst, out_data_f32, 4); + dst.x = dst.x >> 16; + write_imageui(output, coord_out, dst); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl b/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl index 08e66a7..cf0b2b5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl @@ -124,7 +124,7 @@ __kernel void maximum_I32I32toI32 float4 data0 = convert_float4(src0) * input0Scale - input0Tail; float4 data1 = convert_float4(src1) * input1Scale - input1Tail; float4 data = data0 > data1 ? data0 : data1; - int4 dst = convert_int4(data * outputScale + outputZP); + int4 dst = convert_int4_rte(data * outputScale + outputZP); write_imagei(output, coord, dst); } @@ -150,7 +150,7 @@ __kernel void maximum_I32I32toI32_2D float4 data0 = convert_float4(src0) * input0Scale - input0Tail; float4 data1 = convert_float4(src1) * input1Scale - input1Tail; float4 data = data0 > data1 ? data0 : data1; - int4 dst = convert_int4(data * outputScale + outputZP); + int4 dst = convert_int4_rte(data * outputScale + outputZP); write_imagei(output, coord, dst); } diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl b/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl index 27c6501..f02044b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl @@ -124,7 +124,7 @@ __kernel void minimum_I32I32toI32 float4 data0 = convert_float4(src0) * input0Scale - input0Tail; float4 data1 = convert_float4(src1) * input1Scale - input1Tail; float4 data = data0 < data1 ? 
data0 : data1; - int4 dst = convert_int4(data * outputScale + outputZP); + int4 dst = convert_int4_rte(data * outputScale + outputZP); write_imagei(output, coord, dst); } @@ -150,7 +150,7 @@ __kernel void minimum_I32I32toI32_2D float4 data0 = convert_float4(src0) * input0Scale - input0Tail; float4 data1 = convert_float4(src1) * input1Scale - input1Tail; float4 data = data0 < data1 ? data0 : data1; - int4 dst = convert_int4(data * outputScale + outputZP); + int4 dst = convert_int4_rte(data * outputScale + outputZP); write_imagei(output, coord, dst); } diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl b/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl index b2d6aae..91b10d9 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl @@ -1,12 +1,14 @@ + inline float roi_align_1x1 ( __read_only image2d_array_t input, - float2 region_start, - float2 region_end, - float2 bin_size, - int2 grid_size, - float2 rcp_of_grid_size, - int pz + float2 region_start, + float2 region_end, + float2 bin_size, + int2 grid_size, + float2 rcp_of_grid_size, + int pz, + int4 max_spatial_dims ) { float sum = 0; @@ -21,15 +23,24 @@ inline float roi_align_1x1 int2 xy_low = convert_int2(pos); int2 xy_high = xy_low + 1; - float ly = pos.y - xy_low.y; - float lx = pos.x - xy_low.x; - float hy = 1.0f - ly; - float hx = 1.0f - lx; + if (xy_low.x > max_spatial_dims.x || max_spatial_dims.x < -1 || + xy_low.y > max_spatial_dims.y || max_spatial_dims.y < -1 ) + { + continue; + } + + float2 lxy = pos - floor(pos); + float2 zero = 0; + + lxy = xy_low >= max_spatial_dims.zw ? 0.0 : lxy; + + float hy = 1.0f - lxy.y; + float hx = 1.0f - lxy.x; float w1 = hy * hx; - float w2 = hy * lx; - float w3 = ly * hx; - float w4 = ly * lx; + float w2 = lxy.x - lxy.x * lxy.y; + float w3 = lxy.y - lxy.x * lxy.y; + float w4 = lxy.y * lxy.x; float data1 = read_imagef(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x; float data2 = read_imagef(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x; @@ -43,8 +54,9 @@ inline float roi_align_1x1 return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y); } - #define EPS_GRID 0.00001f +#define TYPE_FLOAT16 (1) +#define TYPE_FLOAT32 (2) __kernel void roi_align_F32_F32toF32 ( __read_only image2d_array_t input, @@ -57,13 +69,14 @@ __kernel void roi_align_F32_F32toF32 float output_zp, float spatial_x_scale, float spatial_y_scale, - float in_width, - float in_height, + int in_width, + int in_height, float rcp_of_out_width, float rcp_of_out_height, float sampling_x_ratio, float sampling_y_ratio, - int depth + int depth, + int dtype ) { int px = get_global_id(0); @@ -82,7 +95,10 @@ __kernel void roi_align_F32_F32toF32 float2 spatial_indx = (float2)(px, py); float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height); - float2 max_spatial_dims = (float2)(in_width, in_height); + int4 max_spatial_dims = (int4)(in_width, in_height, in_width, in_height); + max_spatial_dims.zw = max_spatial_dims.zw - 1; + + float2 max_limiatation = convert_float2(max_spatial_dims.zw); float2 bin_size = roi_dims * pooled_dims; float2 region_start = spatial_indx * bin_size + roi_anchor.xy; @@ -105,9 +121,28 @@ __kernel void roi_align_F32_F32toF32 bin_size, grid_size_xy, rcp_of_grid_size, - kz); + kz, + max_spatial_dims); - write_imagef(output, (int4)(px, py, kz1, 0), interp); + if (dtype == TYPE_FLOAT16) + { + half tmp; + short dst; + _viv_asm(CONV, tmp, interp.x); + _viv_asm(COPY, dst, tmp, 2); + + Tensor out_t = create_tensor_from_image2d_array(output, 
2); + short *output_ptr = (short *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0)); + + output_ptr[0] = dst; + } + else + { + Tensor out_t = create_tensor_from_image2d_array(output, 4); + float *output_ptr = (float *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0)); + + output_ptr[0] = interp.x; + } } } @@ -121,7 +156,8 @@ inline float roi_align_1x1_U8toF32 float2 bin_size, int2 grid_size, float2 rcp_of_grid_size, - int pz + int pz, + int4 max_spatial_dims ) { float sum = 0; @@ -132,33 +168,43 @@ inline float roi_align_1x1_U8toF32 { float2 ixy = (float2)(ix + 0.5f, iy + 0.5f); float2 pos = region_start + ixy * bin_size * rcp_of_grid_size; - + int2 xy_low = convert_int2(pos); int2 xy_high = xy_low + 1; - - float ly = pos.y - xy_low.y; - float lx = pos.x - xy_low.x; - float hy = 1.0f - ly; - float hx = 1.0f - lx; - + + float2 lxy = pos - floor(pos); + float2 zero = 0; + + if (xy_low.x > max_spatial_dims.x || max_spatial_dims.x < -1 || + xy_low.y > max_spatial_dims.y || max_spatial_dims.y < -1 ) + { + continue; + } + + lxy = xy_low >= max_spatial_dims.zw ? 0.0 : lxy; + + float hy = 1.0f - lxy.y; + float hx = 1.0f - lxy.x; + float w1 = hy * hx; - float w2 = hy * lx; - float w3 = ly * hx; - float w4 = ly * lx; - + float w2 = lxy.x - lxy.x * lxy.y; + float w3 = lxy.y - lxy.x * lxy.y; + float w4 = lxy.y * lxy.x; + uint4 data; data.x = read_imageui(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x; data.y = read_imageui(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x; data.z = read_imageui(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x; data.w = read_imageui(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x; - + float4 value = convert_float4(data) * input_scale + input_tail; - + sum = sum + w1 * value.x + w2 * value.y + w3 * value.z + w4 * value.w; } } - + return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y); + } __kernel void roi_align_U8_U16toU8 @@ -173,13 +219,14 @@ __kernel void roi_align_U8_U16toU8 float output_zp, float spatial_x_scale, float spatial_y_scale, - float in_width, - float in_height, + int in_width, + int in_height, float rcp_of_out_width, float rcp_of_out_height, float sampling_x_ratio, float sampling_y_ratio, - int depth + int depth, + int dtype ) { int px = get_global_id(0); @@ -198,7 +245,10 @@ __kernel void roi_align_U8_U16toU8 float2 spatial_indx = (float2)(px, py); float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height); - float2 max_spatial_dims = (float2)(in_width, in_height); + int4 max_spatial_dims = (int4)(in_width, in_height, in_width, in_height); + max_spatial_dims.zw = max_spatial_dims.zw - 1; + + float2 max_limiatation = convert_float2(max_spatial_dims.zw); float2 bin_size = roi_dims * pooled_dims; float2 region_start = spatial_indx * bin_size + roi_anchor.xy; @@ -223,12 +273,17 @@ __kernel void roi_align_U8_U16toU8 bin_size, grid_size_xy, rcp_of_grid_size, - kz); + kz, + max_spatial_dims); - uint4 dst; + uchar dst; interp.x = interp.x * output_scale + output_zp; interp.x = interp.x < 255 ? 
interp.x : 255; - dst.x = convert_uint_rte(interp.x); - write_imageui(output, (int4)(px, py, kz1, 0), dst.xxxx); + dst = convert_uchar_rte(interp.x); + + Tensor out_t = create_tensor_from_image2d_array(output, 1); + uchar *output_ptr = (uchar *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0)); + + output_ptr[0] = dst; } } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements.cl b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements.cl new file mode 100644 index 0000000..bb148dd --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements.cl @@ -0,0 +1,298 @@ + +#define SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(name, dtype) \ +__kernel void scatter_elements_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 4); \ + Image update_i = create_image_from_image2d(update, 4); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 4); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = ref_ptr[0]; \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data = update_ptr[x]; \ + break; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(F32_I32_F32toF32, float) +SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(I32_I32_I32toI32, int) + +#define SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 2); \ + Image update_i = create_image_from_image2d(update, 2); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 2); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data = conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \ + break; \ 
+ } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte) +SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(F16_I32_F16toF16, short, convert_short) +SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort) + +#define SCATTER_ELEMENTS_AXIS0_8BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 1); \ + Image update_i = create_image_from_image2d(update, 1); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 1); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data = conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \ + break; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SCATTER_ELEMENTS_AXIS0_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte) +SCATTER_ELEMENTS_AXIS0_8BITS_IMPL(I8_I32_I8toI8, char, convert_char) + +#define SCATTER_ELEMENTS_AXIS1_32BITS_IMPL(name, dtype) \ +__kernel void scatter_elements_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 4); \ + Tensor update_i = create_tensor_from_image2d_array(update, 4); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 4); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = ref_ptr[0]; \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data = update_ptr[y * inner_size]; \ + break; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SCATTER_ELEMENTS_AXIS1_32BITS_IMPL(F32_I32_F32toF32, float) +SCATTER_ELEMENTS_AXIS1_32BITS_IMPL(I32_I32_I32toI32, int) + +#define 
SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 2); \ + Tensor update_i = create_tensor_from_image2d_array(update, 2); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 2); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data = conv_func(convert_float(update_ptr[y * inner_size]) \ + * update_scale + update_tail + output_zp); \ + break; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte) +SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(F16_I32_F16toF16, short, convert_short) +SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort) + +#define SCATTER_ELEMENTS_AXIS1_8BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 1); \ + Tensor update_i = create_tensor_from_image2d_array(update, 1); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 1); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data = conv_func(convert_float(update_ptr[y * inner_size]) \ + * update_scale + update_tail + output_zp); \ + break; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SCATTER_ELEMENTS_AXIS1_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte) 
+SCATTER_ELEMENTS_AXIS1_8BITS_IMPL(I8_I32_I8toI8, char, convert_char) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements_add.cl b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements_add.cl new file mode 100644 index 0000000..a7c67f5 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements_add.cl @@ -0,0 +1,292 @@ + +#define SE_ADD_AXIS0_32BITS_IMPL(name, dtype) \ +__kernel void scatter_elements_add_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 4); \ + Image update_i = create_image_from_image2d(update, 4); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 4); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = ref_ptr[0]; \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data += update_ptr[x]; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_ADD_AXIS0_32BITS_IMPL(F32_I32_F32toF32, float) +SE_ADD_AXIS0_32BITS_IMPL(I32_I32_I32toI32, int) + +#define SE_ADD_AXIS0_16BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_add_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 2); \ + Image update_i = create_image_from_image2d(update, 2); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 2); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data += conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_ADD_AXIS0_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte) +SE_ADD_AXIS0_16BITS_IMPL(F16_I32_F16toF16, short, convert_short) +SE_ADD_AXIS0_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort) + +#define SE_ADD_AXIS0_8BITS_IMPL(name, dtype, conv_func) \ +__kernel 
void scatter_elements_add_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 1); \ + Image update_i = create_image_from_image2d(update, 1); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 1); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data += conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_ADD_AXIS0_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte) +SE_ADD_AXIS0_8BITS_IMPL(I8_I32_I8toI8, char, convert_char) + +#define SE_ADD_AXIS1_32BITS_IMPL(name, dtype) \ +__kernel void scatter_elements_add_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 4); \ + Tensor update_i = create_tensor_from_image2d_array(update, 4); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 4); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = ref_ptr[0]; \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data += update_ptr[y * inner_size]; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_ADD_AXIS1_32BITS_IMPL(F32_I32_F32toF32, float) +SE_ADD_AXIS1_32BITS_IMPL(I32_I32_I32toI32, int) + +#define SE_ADD_AXIS1_16BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_add_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size 
\ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 2); \ + Tensor update_i = create_tensor_from_image2d_array(update, 2); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 2); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data += conv_func(convert_float(update_ptr[y * inner_size]) \ + * update_scale + update_tail + output_zp); \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_ADD_AXIS1_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte) +SE_ADD_AXIS1_16BITS_IMPL(F16_I32_F16toF16, short, convert_short) +SE_ADD_AXIS1_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort) + +#define SE_ADD_AXIS1_8BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_add_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 1); \ + Tensor update_i = create_tensor_from_image2d_array(update, 1); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 1); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data += conv_func(convert_float(update_ptr[y * inner_size]) \ + * update_scale + update_tail + output_zp); \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_ADD_AXIS1_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte) +SE_ADD_AXIS1_8BITS_IMPL(I8_I32_I8toI8, char, convert_char) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements_mul.cl b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements_mul.cl new file mode 100644 index 0000000..46e938c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements_mul.cl @@ -0,0 +1,292 @@ + +#define SE_MUL_AXIS0_32BITS_IMPL(name, dtype) \ +__kernel void scatter_elements_mul_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only 
image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 4); \ + Image update_i = create_image_from_image2d(update, 4); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 4); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = ref_ptr[0]; \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data *= update_ptr[x]; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_MUL_AXIS0_32BITS_IMPL(F32_I32_F32toF32, float) +SE_MUL_AXIS0_32BITS_IMPL(I32_I32_I32toI32, int) + +#define SE_MUL_AXIS0_16BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_mul_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 2); \ + Image update_i = create_image_from_image2d(update, 2); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 2); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data *= conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_MUL_AXIS0_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte) +SE_MUL_AXIS0_16BITS_IMPL(F16_I32_F16toF16, short, convert_short) +SE_MUL_AXIS0_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort) + +#define SE_MUL_AXIS0_8BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_mul_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 1); \ + Image update_i = 
create_image_from_image2d(update, 1); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 1); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data *= conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_MUL_AXIS0_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte) +SE_MUL_AXIS0_8BITS_IMPL(I8_I32_I8toI8, char, convert_char) + +#define SE_MUL_AXIS1_32BITS_IMPL(name, dtype) \ +__kernel void scatter_elements_mul_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 4); \ + Tensor update_i = create_tensor_from_image2d_array(update, 4); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 4); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = ref_ptr[0]; \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data *= update_ptr[y * inner_size]; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_MUL_AXIS1_32BITS_IMPL(F32_I32_F32toF32, float) +SE_MUL_AXIS1_32BITS_IMPL(I32_I32_I32toI32, int) + +#define SE_MUL_AXIS1_16BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_mul_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 2); \ + Tensor update_i = create_tensor_from_image2d_array(update, 2); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 2); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = 
conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data *= conv_func(convert_float(update_ptr[y * inner_size]) \ + * update_scale + update_tail + output_zp); \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_MUL_AXIS1_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte) +SE_MUL_AXIS1_16BITS_IMPL(F16_I32_F16toF16, short, convert_short) +SE_MUL_AXIS1_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort) + +#define SE_MUL_AXIS1_8BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_mul_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 1); \ + Tensor update_i = create_tensor_from_image2d_array(update, 1); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 1); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data *= conv_func(convert_float(update_ptr[y * inner_size]) \ + * update_scale + update_tail + output_zp); \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_MUL_AXIS1_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte) +SE_MUL_AXIS1_8BITS_IMPL(I8_I32_I8toI8, char, convert_char) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bucketize.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bucketize.vx new file mode 100644 index 0000000..9b7e5c9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bucketize.vx @@ -0,0 +1,176 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataConvert_0_4x4; +_viv_uniform VXC_512Bits uniDataConvert_1_4x4; +_viv_uniform int boundaries_size_x8; +_viv_uniform int boundaries_size; + +#define BUCKETIZE_16BITS_SH_IMPL(name, copy_type) \ +__kernel void bucketize_right_##name \ + ( \ + __read_only image2d_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_t output \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + vxc_short8 data0, data1; \ + copy_type src0, src1, dst0, dst1; \ + vxc_ushort8 v0, v1, v2, v3, result = 0; \ + VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, data0, 16); \ + \ + for (; coord.z < boundaries_size_x8; ) \ + { \ + 
VXC_ReadImage(data1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, data1.s00000000, 16); \ + coord.z += 8; \ + \ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v0, dst0, 16); \ + v2 = sub_sat(v0, 0xFFFE); \ + _viv_asm(COPY, src1, data1.s11111111, 16); \ + VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v1, dst1, 16); \ + v3 = sub_sat(v1, 0xFFFE); \ + \ + result = result + v2 + v3; \ + \ + _viv_asm(COPY, src1, data1.s22222222, 16); \ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v0, dst0, 16); \ + v2 = sub_sat(v0, 0xFFFE); \ + _viv_asm(COPY, src1, data1.s33333333, 16); \ + VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v1, dst1, 16); \ + v3 = sub_sat(v1, 0xFFFE); \ + \ + result = result + v2 + v3; \ + \ + _viv_asm(COPY, src1, data1.s44444444, 16); \ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v0, dst0, 16); \ + v2 = sub_sat(v0, 0xFFFE); \ + _viv_asm(COPY, src1, data1.s55555555, 16); \ + VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v1, dst1, 16); \ + v3 = sub_sat(v1, 0xFFFE); \ + \ + result = result + v2 + v3; \ + \ + _viv_asm(COPY, src1, data1.s66666666, 16); \ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v0, dst0, 16); \ + v2 = sub_sat(v0, 0xFFFE); \ + _viv_asm(COPY, src1, data1.s77777777, 16); \ + VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v1, dst1, 16); \ + v3 = sub_sat(v1, 0xFFFE); \ + \ + result = result + v2 + v3; \ + } \ + \ + for (; coord.z < boundaries_size; ) \ + { \ + VXC_ReadImage(data1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, data1.s00000000, 16); \ + coord.z ++; \ + \ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v0, dst0, 16); \ + v2 = sub_sat(v0, 0xFFFE); \ + \ + result = result + v2; \ + } \ + \ + int4 d0, d1; \ + VXC_DP4x4(d0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_0_4x4); \ + VXC_DP4x4(d1, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_1_4x4); \ + coord.z = coord.x + 4; \ + \ + write_imagei(output, coord.xy, d0); \ + write_imagei(output, coord.zy, d1); \ +} +BUCKETIZE_16BITS_SH_IMPL(F16_F16toI32_2D, vxc_half8) +BUCKETIZE_16BITS_SH_IMPL(I16_I16toI32_2D, vxc_short8) + +#define BUCKETIZE_8BITS_SH_IMPL(name, src_type) \ +__kernel void bucketize_right_##name \ + ( \ + __read_only image2d_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_t output \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + src_type src0, src1, src2; \ + vxc_uchar8 dst0, dst1, result = 0; \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + for (; coord.z < boundaries_size_x8; ) \ + { \ + VXC_ReadImage(src1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z += 8; \ + \ + VXC_Clamp(src2, src0, src1.s00000000, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst0, src2, 8); \ + dst0 = sub_sat(dst0, 0xFE); \ + VXC_Clamp(src2, src0, src1.s11111111, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst1, src2, 8); \ + dst1 = 
sub_sat(dst1, 0xFE); \ + \ + result = result + dst0 + dst1; \ + \ + VXC_Clamp(src2, src0, src1.s22222222, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst0, src2, 8); \ + dst0 = sub_sat(dst0, 0xFE); \ + VXC_Clamp(src2, src0, src1.s33333333, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst1, src2, 8); \ + dst1 = sub_sat(dst1, 0xFE); \ + \ + result = result + dst0 + dst1; \ + \ + VXC_Clamp(src2, src0, src1.s44444444, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst0, src2, 8); \ + dst0 = sub_sat(dst0, 0xFE); \ + VXC_Clamp(src2, src0, src1.s55555555, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst1, src2, 8); \ + dst1 = sub_sat(dst1, 0xFE); \ + \ + result = result + dst0 + dst1; \ + \ + VXC_Clamp(src2, src0, src1.s66666666, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst0, src2, 8); \ + dst0 = sub_sat(dst0, 0xFE); \ + VXC_Clamp(src2, src0, src1.s77777777, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst1, src2, 8); \ + dst1 = sub_sat(dst1, 0xFE); \ + \ + result = result + dst0 + dst1; \ + } \ + \ + for (; coord.z < boundaries_size; ) \ + { \ + VXC_ReadImage(src1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z ++; \ + \ + VXC_Clamp(src2, src0, src1.s00000000, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst0, src2, 8); \ + dst0 = sub_sat(dst0, 0xFE); \ + \ + result = result + dst0; \ + } \ + \ + int4 d0, d1; \ + VXC_DP4x4(d0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_0_4x4); \ + VXC_DP4x4(d1, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_1_4x4); \ + coord.z = coord.x + 4; \ + \ + write_imagei(output, coord.xy, d0); \ + write_imagei(output, coord.zy, d1); \ +} +BUCKETIZE_8BITS_SH_IMPL(U8_U8toI32_2D, vxc_uchar8) +BUCKETIZE_8BITS_SH_IMPL(I8_I8toI32_2D, vxc_char8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx index 120e37e..3562ae5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx @@ -98,7 +98,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ half4 tmpVal0, tmpVal1; \ float alpha = scale_vari; \ - float alpha = scale_vari * input_scale; \ + alpha = scale_vari * input_scale; \ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ bias_val = bias_val - input_zp * alpha; \ \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_0.vx index 1644ecd..4b4bf87 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_0.vx @@ -1,20 +1,14 @@ #include "cl_viv_vx_ext.h" _viv_uniform int width; -_viv_uniform int height; -_viv_uniform float inv_multiplier; -_viv_uniform int group_num; _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform VXC_512Bits uniSum_X_X2_16x2; _viv_uniform float input_scale; _viv_uniform float input_scale2; -_viv_uniform float input_zp; _viv_uniform float sum_x_tail; _viv_uniform float sum_x2_tail0; _viv_uniform float sum_x2_tail1; -_viv_uniform float output_scale; -_viv_uniform float output_zp; _viv_uniform VXC_512Bits uniSumX_16x1; _viv_uniform VXC_512Bits uniSumX2_16x1; @@ -23,7 +17,7 @@ _viv_uniform VXC_512Bits 
uniSumX2_16x1; __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + float eps, int height) \ { \ int gidx = get_global_id(0) << 4; \ int lidx = get_local_id(0); \ @@ -81,7 +75,7 @@ INSTANCE_NORM_SUMS_8BITS_IMPL(I8, vxc_char16) __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + float eps, int height) \ { \ int gidx = get_global_id(0) << 4; \ int lidx = get_local_id(0); \ @@ -134,18 +128,62 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums INSTANCE_NORM_SUMS_8BITS_IMPL_2D(U8, vxc_uchar16) INSTANCE_NORM_SUMS_8BITS_IMPL_2D(I8, vxc_char16) +__kernel void instance_norm_means +( + __read_only image2d_t sums, + __read_only image2d_t bias, + __read_only image2d_t scale, + __write_only image2d_t means, + float eps, + float in_time_out_scale, + float input_zp, + float output_scale, + float output_zp, + float inv_multiplier, + int group_num +) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + Image sums_img = create_image_from_image2d(sums, 4); + float4 *sums_ptr = (float4 *)get_image_ptr_from_coord(sums_img, coord); + + float alpha = read_imagef(scale, coord).x; + float beta = read_imagef(bias, coord).x; + + float4 mean_var = sums_ptr[0]; + for(int i = 1; i < group_num;) + { + mean_var += sums_ptr[i]; + i ++; + } + + mean_var *= inv_multiplier; + mean_var.s1 = mean_var.s1 - mean_var.s0 * mean_var.s0 + eps; + mean_var.s1 = rsqrt(mean_var.s1); + + alpha = alpha * mean_var.y; + + float4 dst; + dst.x = in_time_out_scale * alpha; + beta = (beta - alpha * mean_var.x) * output_scale + output_zp; + dst.y = beta - input_zp * dst.x; + + Image means_img = create_image_from_image2d(means, 4); + float4 *means_ptr = (float4 *)get_image_ptr_from_coord(means_img, coord); + means_ptr[0] = dst.xyxy; +} + _viv_uniform VXC_512Bits uniDataToFP32_0_4x4; _viv_uniform VXC_512Bits uniDataToFP32_1_4x4; _viv_uniform VXC_512Bits uniDataToFP32_2_4x4; _viv_uniform VXC_512Bits uniDataToFP32_3_4x4; #define INSTANCE_NORM_8BITS_IMPL(name, src_type, dst_type) \ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \ +__kernel void instance_norm_##name( \ __read_only image2d_array_t input, \ - __read_only image2d_t bias, \ - __read_only image2d_t scale, \ - __read_only image2d_t meanVari, \ + __read_only image2d_t means, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + int height) \ { \ int gidz = get_global_id(1); \ int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ @@ -153,26 +191,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na int2 coord_para = (int2)(0, gidz); \ src_type src0; \ dst_type dst; \ - float scale_vari, bias_val; \ - float4 bias_f, scale_f, mean_vari = (float4)(0); \ + float4 coef; \ \ - scale_f = read_imagef(scale, coord_para); \ - bias_f = read_imagef(bias, coord_para); \ - for(int i = 0; i < group_num; i++) \ - { \ - mean_vari += read_imagef(meanVari, coord_para); \ - coord_para.x += 4; \ - } \ - mean_vari *= inv_multiplier; \ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ - mean_vari.s1 = rsqrt(mean_vari.s1); \ - \ - scale_vari = scale_f.s0 * mean_vari.s1; \ - vxc_int4 tmpVal0, tmpVal1; \ + coef = read_imagef(means, coord_para); \ + int4 tmpVal0, tmpVal1; \ 
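+    /* coef.xy is the fused per-channel scale/offset precomputed by the instance_norm_means kernel above, so the loop below applies normalization as a single data * coef.x + coef.y */ \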
float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ - float alpha = input_scale * output_scale * scale_vari; \ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ - bias_val = bias_val - input_zp * alpha; \ \ int8 input_desc, output_desc; \ _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ @@ -191,14 +214,14 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ - norm = tmpData0 * alpha + bias_val; \ + norm = tmpData0 * coef.x + coef.y; \ tmpVal0 = convert_int4_rte(norm); \ - norm = tmpData1 * alpha + bias_val; \ + norm = tmpData1 * coef.x + coef.y; \ tmpVal1 = convert_int4_rte(norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ - norm = tmpData2 * alpha + bias_val; \ + norm = tmpData2 * coef.x + coef.y; \ tmpVal0 = convert_int4_rte(norm); \ - norm = tmpData3 * alpha + bias_val; \ + norm = tmpData3 * coef.x + coef.y; \ tmpVal1 = convert_int4_rte(norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ VXC_OP4_NoDest(img_store_3d, output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ @@ -208,60 +231,46 @@ INSTANCE_NORM_8BITS_IMPL(U8_F32toU8, vxc_uchar16, vxc_uchar16) INSTANCE_NORM_8BITS_IMPL(I8_F32toI8, vxc_char16, vxc_char16) #define INSTANCE_NORM_8BITS_IMPL_2D(name, src_type, dst_type) \ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \ +__kernel void instance_norm_##name##_2D( \ __read_only image2d_array_t input, \ - __read_only image2d_t bias, \ - __read_only image2d_t scale, \ - __read_only image2d_t meanVari, \ + __read_only image2d_t means, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + int height) \ { \ int gidz = get_global_id(1); \ int gidy = gidz * height; \ - int2 coord = (int2)(get_global_id(0), gidy); \ + int4 coord; \ int2 coord_para = (int2)(0, gidz); \ int endH = gidy + height; \ src_type src0; \ dst_type dst; \ - float scale_vari, bias_val; \ - float4 bias_f, scale_f, mean_vari = (float4)(0); \ + float4 coef; \ \ - scale_f = read_imagef(scale, coord_para); \ - bias_f = read_imagef(bias, coord_para); \ - for(int i = 0; i < group_num; i++) \ - { \ - mean_vari += read_imagef(meanVari, coord_para); \ - coord_para.x += 4; \ - } \ - mean_vari *= inv_multiplier; \ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ - mean_vari.s1 = rsqrt(mean_vari.s1); \ - \ - scale_vari = scale_f.s0 * mean_vari.s1; \ - vxc_int4 tmpVal0, tmpVal1; \ + coef = read_imagef(means, coord_para); \ + int4 tmpVal0, tmpVal1; \ float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ - float alpha = input_scale * output_scale * scale_vari; \ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ - bias_val = bias_val - input_zp * alpha; \ \ - for(; coord.y < endH; coord.y++) \ + coord = (int4)(get_global_id(0), gidy, gidy - 1, gidy - 1); \ + \ + for(; coord.y < endH; ) \ { \ - VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.yz++; \ VXC_DP4x4(tmpData0, src0, src0, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ - norm = tmpData0 * alpha + bias_val; \ + norm = tmpData0 * coef.x + coef.y; \ tmpVal0 = convert_int4_rte(norm); \ - norm = tmpData1 * alpha + bias_val; \ + norm = tmpData1 * coef.x + coef.y; \ tmpVal1 = convert_int4_rte(norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ - norm = tmpData2 * alpha + bias_val; \ + norm = tmpData2 * coef.x + coef.y; \ tmpVal0 = convert_int4_rte(norm); \ - norm = tmpData3 * alpha + bias_val; \ + norm = tmpData3 * coef.x + coef.y; \ tmpVal1 = convert_int4_rte(norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord.xz, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ } \ } INSTANCE_NORM_8BITS_IMPL_2D(U8_F32toU8, vxc_uchar16, vxc_uchar16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_1.vx index 82d1704..322dac5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_1.vx @@ -1,11 +1,5 @@ #include "cl_viv_vx_ext.h" -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform float inv_multiplier; -_viv_uniform int group_num; -_viv_uniform float input_scale; -_viv_uniform float input_zp; _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform VXC_512Bits uniDataToFP32_0_4x4; @@ -14,13 +8,11 @@ _viv_uniform VXC_512Bits uniDataToFP32_2_4x4; _viv_uniform VXC_512Bits uniDataToFP32_3_4x4; #define INSTANCE_NORM_8_TO_F16_IMPL(name, src_type) \ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \ +__kernel void instance_norm_##name( \ __read_only image2d_array_t input, \ - __read_only image2d_t bias, \ - __read_only image2d_t scale, \ - __read_only image2d_t meanVari, \ + __read_only image2d_t means, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + int height) \ { \ int gidz = get_global_id(1); \ int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ @@ -28,25 +20,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na src_type src0; \ vxc_short8 outval; \ vxc_half8 dst; \ - float scale_vari, bias_val; \ - float4 bias_f, scale_f, mean_vari = (float4)(0); \ + float4 coef; \ \ - scale_f = read_imagef(scale, coord_para.xy); \ - bias_f = read_imagef(bias, coord_para.xy); \ - for(int i = 0; i < group_num; i++) \ - { \ - mean_vari += read_imagef(meanVari, coord_para.xy); \ - coord_para.x += 4; \ - } \ - mean_vari *= inv_multiplier; \ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ - mean_vari.s1 = rsqrt(mean_vari.s1); \ - scale_vari = scale_f.s0 * mean_vari.s1; \ + coef = read_imagef(means, coord_para.xy); \ float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ half4 tmpVal0, tmpVal1; \ - float alpha = scale_vari * input_scale; \ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ - bias_val = bias_val - input_zp * alpha; \ \ coord_para = coord; \ int8 input_desc, 
output_desc; \ @@ -67,17 +45,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ - norm = alpha * tmpData0 + bias_val; \ + norm = tmpData0 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal0, norm); \ - norm = alpha * tmpData1 + bias_val; \ + norm = tmpData1 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal1, norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ coord_para.x += 8; \ - norm = alpha * tmpData2 + bias_val; \ + norm = tmpData2 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal0, norm); \ - norm = alpha * tmpData3 + bias_val; \ + norm = tmpData3 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal1, norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ @@ -88,13 +66,11 @@ INSTANCE_NORM_8_TO_F16_IMPL(U8_F32toF16, vxc_uchar16) INSTANCE_NORM_8_TO_F16_IMPL(I8_F32toF16, vxc_char16) #define INSTANCE_NORM_8_TO_F16_IMPL_2D(name, src_type) \ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \ +__kernel void instance_norm_##name##_2D( \ __read_only image2d_array_t input, \ - __read_only image2d_t bias, \ - __read_only image2d_t scale, \ - __read_only image2d_t meanVari, \ + __read_only image2d_t means, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + int height) \ { \ int gidz = get_global_id(1); \ int gidy = gidz * height; \ @@ -104,26 +80,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na src_type src0; \ vxc_short8 outval; \ vxc_half8 dst; \ - float scale_vari, bias_val; \ - float4 bias_f, scale_f, mean_vari = (float4)(0); \ + float4 coef; \ \ - scale_f = read_imagef(scale, coord_para.xy); \ - bias_f = read_imagef(bias, coord_para.xy); \ - for(int i = 0; i < group_num; i++) \ - { \ - mean_vari += read_imagef(meanVari, coord_para.xy); \ - coord_para.x += 4; \ - } \ - mean_vari *= inv_multiplier; \ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ - mean_vari.s1 = rsqrt(mean_vari.s1); \ - \ - scale_vari = scale_f.s0 * mean_vari.s1; \ + coef = read_imagef(means, coord_para.xy); \ float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ half4 tmpVal0, tmpVal1; \ - float alpha = scale_vari * input_scale; \ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ - bias_val = bias_val - input_zp * alpha; \ for(; coord.y < endH;) \ { \ VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ @@ -133,17 +94,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ - norm = alpha * tmpData0 + bias_val; \ + norm = tmpData0 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal0, norm); \ - norm = alpha * tmpData1 + bias_val; \ + norm = tmpData1 * coef.x + 
coef.y; \ _viv_asm(CONV, tmpVal1, norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ coord_para.x += 8; \ - norm = alpha * tmpData2 + bias_val; \ + norm = tmpData2 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal0, norm); \ - norm = alpha * tmpData3 + bias_val; \ + norm = tmpData3 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal1, norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_2.vx index 75221f4..2289baf 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_2.vx @@ -1,28 +1,21 @@ #include "cl_viv_vx_ext.h" _viv_uniform int width; -_viv_uniform int height; -_viv_uniform float inv_multiplier; -_viv_uniform int group_num; _viv_uniform VXC_512Bits uniDataToFP32_0_4x4; _viv_uniform VXC_512Bits uniDataToFP32_1_4x4; _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform VXC_512Bits uniSum_X_X2_8x2; _viv_uniform float input_scale; _viv_uniform float input_scale2; -_viv_uniform float input_zp; _viv_uniform float sum_x_tail; _viv_uniform float sum_x2_tail0; _viv_uniform float sum_x2_tail1; -_viv_uniform float output_scale; -_viv_uniform float output_zp; - #define INSTANCE_NORM_SUMS_16BITS_IMPL(name, src_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + float eps, int height) \ { \ int gidx = get_global_id(0) << 3; \ int lidx = get_local_id(0); \ @@ -87,7 +80,7 @@ INSTANCE_NORM_SUMS_16BITS_IMPL(I16, vxc_short8) __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + float eps, int height) \ { \ int gidx = get_global_id(0) << 3; \ int lidx = get_local_id(0); \ @@ -146,13 +139,11 @@ INSTANCE_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8) INSTANCE_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8) #define INSTANCE_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \ +__kernel void instance_norm_##name( \ __read_only image2d_array_t input, \ - __read_only image2d_t bias, \ - __read_only image2d_t scale, \ - __read_only image2d_t meanVari, \ + __read_only image2d_t means, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + int height) \ { \ int gidz = get_global_id(1); \ int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ @@ -160,28 +151,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na int4 coord_para = (int4)(0, gidz, 0, 0); \ vxc_short8 src0; \ src_type in_h; \ - float scale_vari, bias_val; \ - float4 bias_f, scale_f, mean_vari = (float4)(0); \ + float4 coef; \ \ - scale_f = read_imagef(scale, coord_para.xy); \ - bias_f = read_imagef(bias, coord_para.xy); \ + coef = read_imagef(means, coord_para.xy); \ \ - for(int i = 0; i < group_num; i++) \ - { \ - mean_vari += read_imagef(meanVari, coord_para.xy); \ - coord_para.x += 4; \ - } \ - mean_vari *= 
inv_multiplier; \ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ - mean_vari.s1 = rsqrt(mean_vari.s1); \ - \ - scale_vari = scale_f.s0 * mean_vari.s1; \ - float alpha = input_scale * output_scale * scale_vari; \ float4 tmpData0, tmpData1; \ copy_type outval; \ conv_type tmpVal0, tmpVal1; \ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ - bias_val = bias_val - input_zp * alpha; \ dst_type dst; \ \ int8 input_desc, output_desc; \ @@ -204,9 +180,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ \ float4 norm; \ - norm = alpha * tmpData0 + bias_val; \ + norm = tmpData0 * coef.x + coef.y; \ _viv_asm(CONV_RTE, tmpVal0, norm); \ - norm = alpha * tmpData1 + bias_val; \ + norm = tmpData1 * coef.x + coef.y; \ _viv_asm(CONV_RTE, tmpVal1, norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ @@ -221,13 +197,11 @@ INSTANCE_NORM_16BITS_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4 INSTANCE_NORM_16BITS_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) #define INSTANCE_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \ +__kernel void instance_norm_##name##_2D( \ __read_only image2d_array_t input, \ - __read_only image2d_t bias, \ - __read_only image2d_t scale, \ - __read_only image2d_t meanVari, \ + __read_only image2d_t means, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + int height) \ { \ int gidz = get_global_id(1); \ int gidy = gidz * height; \ @@ -236,28 +210,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na int endH = gidy + height; \ vxc_short8 src0; \ src_type in_h; \ - float scale_vari, bias_val; \ - float4 bias_f, scale_f, mean_vari = (float4)(0); \ + float4 coef; \ \ - scale_f = read_imagef(scale, coord_para.xy); \ - bias_f = read_imagef(bias, coord_para.xy); \ + coef = read_imagef(means, coord_para.xy); \ \ - for(int i = 0; i < group_num; i++) \ - { \ - mean_vari += read_imagef(meanVari, coord_para.xy); \ - coord_para.x += 4; \ - } \ - mean_vari *= inv_multiplier; \ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ - mean_vari.s1 = rsqrt(mean_vari.s1); \ - \ - scale_vari = scale_f.s0 * mean_vari.s1; \ - float alpha = input_scale * output_scale * scale_vari; \ float4 tmpData0, tmpData1; \ copy_type outval; \ conv_type tmpVal0, tmpVal1; \ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ - bias_val = bias_val - input_zp * alpha; \ dst_type dst; \ \ for(; coord.y < endH; coord.y++) \ @@ -268,9 +227,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ float4 norm; \ - norm = alpha * tmpData0 + bias_val; \ + norm = tmpData0 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal0, norm); \ - norm = alpha * tmpData1 + bias_val; \ + norm = tmpData1 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal1, norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_3.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_3.vx index 19f335b..078cca3 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_3.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_3.vx @@ -1,15 +1,13 @@ #include "cl_viv_vx_ext.h" _viv_uniform int width; -_viv_uniform int height; -_viv_uniform float inv_multiplier; -_viv_uniform int group_num; + _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; _viv_uniform VXC_512Bits uniExtractOddData_2x8; __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16( - image2d_array_t input, image2d_array_t output, float eps, int rsFlg) + image2d_array_t input, image2d_array_t output, float eps, int height) { int gidx = get_global_id(0) << 3; int lidx = get_local_id(0); @@ -70,7 +68,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16_2D( - image2d_array_t input, image2d_array_t output, float eps, int rsFlg) + image2d_array_t input, image2d_array_t output, float eps, int height) { int gidx = get_global_id(0) << 3; int lidx = get_local_id(0); @@ -129,36 +127,21 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums } } -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16_F32toBF16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) +__kernel void instance_norm_BF16_F32toBF16( + __read_only image2d_array_t input, + __read_only image2d_t means, + __write_only image2d_array_t output, + int height) { int gidz = get_global_id(1); int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); vxc_short8 src0, src1, src2; - float scale_vari, bias_val; - float4 mean_vari = (float4)(0); + float4 coef; - Image img3 = create_image_from_image2d(meanVari, 4); - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz)); - __global float4* vari_ptr = (__global float4*)sumVari_ptr; + coef = read_imagef(means, coord.yz); - float sval = read_imagef(scale, coord.yz).x; - float bval = read_imagef(bias, coord.yz).x; - - for(int i = 0; i < group_num; i++) - { - mean_vari += vari_ptr[i]; - } - - mean_vari *= inv_multiplier; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = sval * mean_vari.s1; float4 tmpData0, tmpData1; - bias_val = (bval - scale_vari * mean_vari.s0); int8 input_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); @@ -171,6 +154,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 for(coord.y = 0; coord.y < height; coord.y++) { + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_in.y ++; @@ -182,9 +166,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 _viv_asm(COPY, tmpData1, src2, 16); float4 norm; - norm = scale_vari * tmpData0 + bias_val; + norm = tmpData0 * coef.x + coef.y; _viv_asm(COPY, src0, norm, 16); - norm = scale_vari * tmpData1 + bias_val; + norm = tmpData1 * coef.x + coef.y; _viv_asm(COPY, src1, norm, 16); VXC_DP2x8(src2, 
src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); VXC_OP4_NoDest(img_store_3d, output, coord, src2, \ @@ -192,41 +176,27 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 } } -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) +__kernel void instance_norm_BF16_F32toBF16_2D( + __read_only image2d_array_t input, + __read_only image2d_t means, + __write_only image2d_array_t output, + int height) { int gidz = get_global_id(1); int gidy = gidz * height; int2 coord = (int2)(get_global_id(0), gidy); - int2 coord_para = (int2)(gidz, 0); + int2 coord_para = (int2)(0, gidz); int endH = gidy + height; vxc_short8 src0, src1, src2; - float scale_vari, bias_val; - float4 mean_vari = (float4)(0); + float4 coef; - Image img3 = create_image_from_image2d(meanVari, 4); - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); - __global float4* vari_ptr = (__global float4*)sumVari_ptr; + coef = read_imagef(means, coord_para); - float sval = read_imagef(scale, coord_para.yx).x; - float bval = read_imagef(bias, coord_para.yx).x; - - for(int i = 0; i < group_num; i++) - { - mean_vari += vari_ptr[i]; - } - - mean_vari *= inv_multiplier; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = sval * mean_vari.s1; float4 tmpData0, tmpData1; - bias_val = (bval - scale_vari * mean_vari.s0); for(; coord.y < endH; coord.y++) { + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), @@ -237,9 +207,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 _viv_asm(COPY, tmpData1, src2, 16); float4 norm; - norm = scale_vari * tmpData0 + bias_val; + norm = tmpData0 * coef.x + coef.y; _viv_asm(COPY, src0, norm, 16); - norm = scale_vari * tmpData1 + bias_val; + norm = tmpData1 * coef.x + coef.y; _viv_asm(COPY, src1, norm, 16); VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx index 95d9c87..672c61f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx @@ -150,7 +150,7 @@ _viv_uniform int inputZP; VXC_Vstore3(dst_ptr, 0, dst.s012); \ break; \ case 4: \ - VXC_Vstore4(dst_ptr, 0, dst.0123); \ + VXC_Vstore4(dst_ptr, 0, dst.s0123); \ break; \ case 5: \ VXC_Vstore2(dst_ptr, 0, dst.s01); \ @@ -165,7 +165,7 @@ _viv_uniform int inputZP; VXC_Vstore3(dst_ptr, 0, dst.s012); \ break; \ case 7: \ - VXC_Vstore4(dst_ptr, 0, dst.0123); \ + VXC_Vstore4(dst_ptr, 0, dst.s0123); \ dst.s012 = dst.s456; \ dst_ptr += 4; \ VXC_Vstore3(dst_ptr, 0, dst.s012); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx index b4db308..07ede71 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx 
@@ -10,6 +10,11 @@ _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; _viv_uniform int ac2zero; _viv_uniform int bc2zero; +_viv_uniform VXC_512Bits uniI16MulI16SumtoI32_16x1; +_viv_uniform VXC_512Bits uniI16MulI16SumtoI32B_16x1; +_viv_uniform float inout_beta; +_viv_uniform float inout_scale; + #define GEMM_QINT_TO_QINT(src0_type_name, read_type) \ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \ image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ @@ -102,3 +107,139 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } GEMM_QINT_TO_QINT(I16, vxc_short8) + +__kernel void gemm_transb_I16I16toI16(image2d_array_t inputA, + image2d_array_t inputB, image2d_array_t output, + int transposeA, int transposeB, int adjointA, int adjointB, + uint M, uint K, uint N) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0); + + vxc_float4 sum0 = (vxc_float4)(0); + vxc_float4 sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0); + vxc_float4 sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;) + { + vxc_short8 srcA0,srcA1,srcA2,srcA3; + vxc_short8 srcB0,srcB1,srcB2,srcB3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 8; + coord_b.x += 8; + + vxc_int4 iVal; + vxc_float4 fpVal; + VXC_DP16x1(iVal, srcA0, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA0, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA0, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + 
VXC_DP16x1(iVal, srcA0, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + sum0 = sum0 + fpVal * inout_scale + inout_beta; + + VXC_DP16x1(iVal, srcA1, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA1, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA1, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA1, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + sum1 = sum1 + fpVal * inout_scale + inout_beta; + + VXC_DP16x1(iVal, srcA2, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA2, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA2, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA2, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + sum2 = sum2 + fpVal * inout_scale + inout_beta; + + VXC_DP16x1(iVal, srcA3, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA3, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA3, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA3, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + sum3 = sum3 + fpVal * inout_scale + inout_beta; + } + vxc_int4 tmpOut0, tmpOut1; + vxc_short8 valDst; + tmpOut0 = convert_int4_rte(sum0); + tmpOut1 = convert_int4_rte(sum1); + VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + tmpOut0 = convert_int4_rte(sum2); + tmpOut1 = 
convert_int4_rte(sum3); + VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx new file mode 100644 index 0000000..41f1c08 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx @@ -0,0 +1,86 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int bOrder; +_viv_uniform int rOrder; + +_viv_uniform float outputScaleVar; +_viv_uniform float bMeanScaleVarZp; +_viv_uniform float gMeanScaleVarZp; +_viv_uniform float rMeanScaleVarZp; + +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4; + +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8; + +#define NV12_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ +__kernel void pre_process_nv12_copy_##name \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t uv_img, \ + __write_only image2d_array_t output, \ + global int* xRatio, \ + global int* yRatio, \ + global int* xOffset, \ + global int* yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int sy = gidy + (*yOffset); \ + int sx = gidx + (*xOffset); \ + int uvX = sx & 0xfffffffe; \ + int uvY = sy >> 1; \ + \ + vxc_uchar16 Y, UV; \ + \ + VXC_ReadImage(Y, y_img, (int2)(sx,sy), 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), 0,VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_char16 tmpUV; \ + short tmpVal = 128; \ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \ + \ + float4 tmpDstB, tmpDstG, tmpDstR; \ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \ + \ + conv_type result; \ + dst_type dst0; \ + save_type dst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstB); \ + dstPos.z = bOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstG); \ + dstPos.z = 1; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstR); \ + dstPos.z = rOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, 
VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +NV12_COPY_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8) +NV12_COPY_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8) +NV12_COPY_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16) +NV12_COPY_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx index 4c6f935..ac6ba3d 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx @@ -8,151 +8,195 @@ _viv_uniform float bMeanScaleVarZp; _viv_uniform float gMeanScaleVarZp; _viv_uniform float rMeanScaleVarZp; -_viv_uniform uint xrIntFloat_16; -_viv_uniform uint yrIntFloat_16; +_viv_uniform uint xrIntFloat_16; +_viv_uniform uint yrIntFloat_16; _viv_uniform VXC_512Bits uniConvertNV12toB_4x4; _viv_uniform VXC_512Bits uniConvertNV12toG_4x4; _viv_uniform VXC_512Bits uniConvertNV12toR_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; + +_viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8; -__kernel void pre_process_nv12_scale_U8toI16( - __read_only image2d_t y_img, __read_only image2d_t uv_img, - __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - uint4 gidx = get_global_id(0); - uint gidy = get_global_id(1); - gidx += (uint4)(0, 1, 2, 3); +_viv_uniform VXC_512Bits uniCalculateYShift_2x8; +_viv_uniform VXC_512Bits uniCalculateUVShift_2x8; - uint dy = (gidy * yrIntFloat_16) >> 16; - uint4 dx = (gidx * xrIntFloat_16) >> 16; - int sy = convert_int(dy) + (*yOffset); - int4 sx = convert_int4(dx) + (*xOffset); - int4 uvX = sx & 0xfffffffe; - int uvY = sy >> 1; - - vxc_uchar16 Y, UV; - int2 coord = (int2)(sx.x, sy); - int2 coord_uv = (int2)(uvX.x, uvY); - - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.y; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.z; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.w; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.y; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.z; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.w; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - vxc_char16 tmpUV; - short tmpVal = 128; - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); - - float4 tmpDstB, tmpDstG, tmpDstR; - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); 
- VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); - - int4 result; - vxc_short8 dst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); - dstPos.z = bOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); - dstPos.z = 1; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); - dstPos.z = rOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +#define NV12_OPT_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ +__kernel void pre_process_nv12_scale_##name##_gq \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t uv_img, \ + __write_only image2d_array_t output, \ + global int* xRatio, \ + global int* yRatio, \ + global int* xOffset, \ + global int* yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + uint4 gidx = get_global_id(0); \ + uint gidy = get_global_id(1); \ + gidx += (uint4)(0, 1, 2, 3); \ + \ + uint dy = (gidy * yrIntFloat_16) >> 16; \ + uint4 dx = (gidx * xrIntFloat_16) >> 16; \ + int sy = convert_int(dy) + (*yOffset); \ + int4 sx = convert_int4(dx) + (*xOffset); \ + int4 uvX = sx & 0xfffffffe; \ + int uvY = sy >> 1; \ + \ + vxc_uchar16 Y, UV; \ + int2 coord = (int2)(sx.x, sy); \ + int2 coord_uv = (int2)(uvX.x, uvY); \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \ + vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \ + int4 offsetUV = uvX - uvX.x; \ + \ + vxc_ushort8 diffY, diffUV; \ + _viv_asm(COPY, diffY, sx, 16); \ + _viv_asm(COPY, diffUV, offsetUV, 16); \ + \ + vxc_ushort8 constData = 8; \ + VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniCalculateYShift_2x8); \ + VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniCalculateUVShift_2x8); \ + VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_char16 tmpUV; \ + short tmpVal = 128; \ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \ + \ + float4 tmpDstB, tmpDstG, tmpDstR; \ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \ + \ + conv_type result; \ + dst_type dst0; \ + save_type dst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + tmpDstB = 
tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstB); \ + dstPos.z = bOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstG); \ + dstPos.z = 1; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstR); \ + dstPos.z = rOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } +NV12_OPT_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8) +NV12_OPT_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8) +NV12_OPT_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16) +NV12_OPT_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16) -__kernel void pre_process_nv12_scale_U8toF16( - __read_only image2d_t y_img, __read_only image2d_t uv_img, - __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - uint4 gidx = get_global_id(0); - uint gidy = get_global_id(1); - gidx += (uint4)(0, 1, 2, 3); - - uint dy = (gidy * yrIntFloat_16) >> 16; - uint4 dx = (gidx * xrIntFloat_16) >> 16; - int sy = convert_int(dy) + (*yOffset); - int4 sx = convert_int4(dx) + (*xOffset); - int4 uvX = sx & 0xfffffffe; - int uvY = sy >> 1; - - vxc_uchar16 Y, UV; - int2 coord = (int2)(sx.x, sy); - int2 coord_uv = (int2)(uvX.x, uvY); - - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.y; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.z; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.w; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.y; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.z; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.w; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - vxc_char16 tmpUV; - short tmpVal = 128; - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); - - float4 tmpDstB, tmpDstG, tmpDstR; - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); - - tmpDstB = tmpDstB * outputScaleVar + 
bMeanScaleVarZp; - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; - - half4 result; - vxc_half8 tmpdst; - vxc_short8 dst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - _viv_asm(CONV, result, tmpDstB); - dstPos.z = bOrder; - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpdst, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(CONV, result, tmpDstG); - dstPos.z = 1; - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpdst, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(CONV, result, tmpDstR); - dstPos.z = rOrder; - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpdst, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file +#define NV12_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ +__kernel void pre_process_nv12_scale_##name \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t uv_img, \ + __write_only image2d_array_t output, \ + global int* xRatio, \ + global int* yRatio, \ + global int* xOffset, \ + global int* yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + uint4 gidx = get_global_id(0); \ + uint gidy = get_global_id(1); \ + gidx += (uint4)(0, 1, 2, 3); \ + \ + uint dy = (gidy * yrIntFloat_16) >> 16; \ + uint4 dx = (gidx * xrIntFloat_16) >> 16; \ + int sy = convert_int(dy) + (*yOffset); \ + int4 sx = convert_int4(dx) + (*xOffset); \ + int4 uvX = sx & 0xfffffffe; \ + int uvY = sy >> 1; \ + \ + vxc_uchar16 Y, UV; \ + int2 coord = (int2)(sx.x, sy); \ + int2 coord_uv = (int2)(uvX.x, uvY); \ + \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord.x = sx.y; \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord.x = sx.z; \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord.x = sx.w; \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_uv.x = uvX.y; \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_uv.x = uvX.z; \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_uv.x = uvX.w; \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_char16 tmpUV; \ + short tmpVal = 128; \ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \ + \ + float4 tmpDstB, tmpDstG, tmpDstR; \ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \ + \ + conv_type result; \ + dst_type dst0; \ + save_type dst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + 
_viv_asm(CONV_RTE, result, tmpDstB); \ + dstPos.z = bOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstG); \ + dstPos.z = 1; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstR); \ + dstPos.z = rOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +NV12_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8) +NV12_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8) +NV12_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16) +NV12_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_8bits.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_8bits.vx deleted file mode 100644 index c274c3c..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_8bits.vx +++ /dev/null @@ -1,197 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int bOrder; -_viv_uniform int rOrder; - -_viv_uniform float outputScaleVar; -_viv_uniform float bMeanScaleVarZp; -_viv_uniform float gMeanScaleVarZp; -_viv_uniform float rMeanScaleVarZp; - -_viv_uniform uint xrIntFloat_16; -_viv_uniform uint yrIntFloat_16; - -_viv_uniform VXC_512Bits uniConvertNV12toB_4x4; -_viv_uniform VXC_512Bits uniConvertNV12toG_4x4; -_viv_uniform VXC_512Bits uniConvertNV12toR_4x4; - -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8; -_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8; - -__kernel void pre_process_nv12_scale_U8toU8( - __read_only image2d_t y_img, __read_only image2d_t uv_img, - __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - uint4 gidx = get_global_id(0); - uint gidy = get_global_id(1); - gidx += (uint4)(0, 1, 2, 3); - - uint dy = (gidy * yrIntFloat_16) >> 16; - uint4 dx = (gidx * xrIntFloat_16) >> 16; - int sy = convert_int(dy) + (*yOffset); - int4 sx = convert_int4(dx) + (*xOffset); - int4 uvX = sx & 0xfffffffe; - int uvY = sy >> 1; - - vxc_uchar16 Y, UV; - int2 coord = (int2)(sx.x, sy); - int2 coord_uv = (int2)(uvX.x, uvY); - - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.y; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.z; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.w; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.y; - VXC_ReadImage(UV, 
uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.z; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.w; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - vxc_char16 tmpUV; - short tmpVal = 128; - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); - - float4 tmpDstB, tmpDstG, tmpDstR; - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); - - int4 result; - vxc_uchar8 dst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); - dstPos.z = bOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); - dstPos.z = 1; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); - dstPos.z = rOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pre_process_nv12_copy_U8toU8( - __read_only image2d_t y_img, __read_only image2d_t uv_img, - __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - int sy = gidy + (*yOffset); - int sx = gidx + (*xOffset); - int uvX = sx & 0xfffffffe; - int uvY = sy >> 1; - - vxc_uchar16 Y, UV; - - VXC_ReadImage(Y, y_img, (int2)(sx,sy), VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - vxc_char16 tmpUV; - short tmpVal = 128; - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); - - float4 tmpDstB, tmpDstG, tmpDstR; - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); - - int4 result; - vxc_uchar8 dst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); - dstPos.z = bOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); - dstPos.z = 1; - VXC_DP2x8(dst, result, result, 
VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); - dstPos.z = rOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pre_process_nv12_scale_U8toI8( - __read_only image2d_t y_img, __read_only image2d_t uv_img, - __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - uint4 gidx = get_global_id(0); - uint gidy = get_global_id(1); - gidx += (uint4)(0, 1, 2, 3); - - uint dy = (gidy * yrIntFloat_16) >> 16; - uint4 dx = (gidx * xrIntFloat_16) >> 16; - int sy = convert_int(dy) + (*yOffset); - int4 sx = convert_int4(dx) + (*xOffset); - int4 uvX = sx & 0xfffffffe; - int uvY = sy >> 1; - - vxc_uchar16 Y, UV; - int2 coord = (int2)(sx.x, sy); - int2 coord_uv = (int2)(uvX.x, uvY); - - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.y; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.z; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.w; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.y; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.z; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.w; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - vxc_char16 tmpUV; - short tmpVal = 128; - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); - - float4 tmpDstB, tmpDstG, tmpDstR; - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); - - int4 result; - vxc_char8 dst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); - dstPos.z = bOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); - dstPos.z = 1; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); - dstPos.z = rOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - 
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_mix.vx deleted file mode 100644 index 0a4551f..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_mix.vx +++ /dev/null @@ -1,162 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int bOrder; -_viv_uniform int rOrder; - -_viv_uniform float outputScaleVar; -_viv_uniform float bMeanScaleVarZp; -_viv_uniform float gMeanScaleVarZp; -_viv_uniform float rMeanScaleVarZp; - -_viv_uniform uint xrIntFloat_16; -_viv_uniform uint yrIntFloat_16; - -_viv_uniform VXC_512Bits uniConvertNV12toB_4x4; -_viv_uniform VXC_512Bits uniConvertNV12toG_4x4; -_viv_uniform VXC_512Bits uniConvertNV12toR_4x4; - -_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; - -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8; - -_viv_uniform VXC_512Bits uniCalculateYShift_2x8; -_viv_uniform VXC_512Bits uniCalculateUVShift_2x8; - -__kernel void pre_process_nv12_scale_U8toU8_gq( - __read_only image2d_t y_img, __read_only image2d_t uv_img, - __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - uint4 gidx = get_global_id(0); - uint gidy = get_global_id(1); - gidx += (uint4)(0, 1, 2, 3); - - uint dy = (gidy * yrIntFloat_16) >> 16; - uint4 dx = (gidx * xrIntFloat_16) >> 16; - int sy = convert_int(dy) + (*yOffset); - int4 sx = convert_int4(dx) + (*xOffset); - int4 uvX = sx & 0xfffffffe; - int uvY = sy >> 1; - - vxc_uchar16 Y, UV; - int2 coord = (int2)(sx.x, sy); - int2 coord_uv = (int2)(uvX.x, uvY); - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; - vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; - int4 offsetUV = uvX - uvX.x; - - vxc_ushort8 diffY, diffUV; - _viv_asm(COPY, diffY, sx, 16); - _viv_asm(COPY, diffUV, offsetUV, 16); - - vxc_ushort8 constData = 8; - VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniCalculateYShift_2x8); - VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniCalculateUVShift_2x8); - VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - vxc_char16 tmpUV; - short tmpVal = 128; - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); - - float4 tmpDstB, tmpDstG, tmpDstR; - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); - - int4 result; - vxc_uchar8 dst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); - dstPos.z = bOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); - dstPos.z = 1; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); - dstPos.z = rOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pre_process_nv12_scale_U8toF16_gq( - __read_only image2d_t y_img, __read_only image2d_t uv_img, - __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - uint4 gidx = get_global_id(0); - uint gidy = get_global_id(1); - gidx += (uint4)(0, 1, 2, 3); - - uint dy = (gidy * yrIntFloat_16) >> 16; - uint4 dx = (gidx * xrIntFloat_16) >> 16; - int sy = convert_int(dy) + (*yOffset); - int4 sx = convert_int4(dx) + (*xOffset); - int4 uvX = sx & 0xfffffffe; - int uvY = sy >> 1; - - vxc_uchar16 Y, UV; - int2 coord = (int2)(sx.x, sy); - int2 coord_uv = (int2)(uvX.x, uvY); - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; - vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; - int4 offsetUV = uvX - uvX.x; - - vxc_ushort8 diffY, diffUV; - _viv_asm(COPY, diffY, sx, 16); - _viv_asm(COPY, diffUV, offsetUV, 16); - - vxc_ushort8 constData = 8; - VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniCalculateYShift_2x8); - VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniCalculateUVShift_2x8); - VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - vxc_char16 tmpUV; - short tmpVal = 128; - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); - - float4 tmpDstB, tmpDstG, tmpDstR; - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); - - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; - - half4 result; - vxc_half8 tmpdst; - vxc_short8 dst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - _viv_asm(CONV, result, tmpDstB); - dstPos.z = bOrder; - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpdst, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(CONV, result, tmpDstG); - dstPos.z = 1; - VXC_DP2x8(tmpdst, result, result, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpdst, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(CONV, result, tmpDstR); - dstPos.z = rOrder; - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpdst, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx new file mode 100644 index 0000000..25f981a --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx @@ -0,0 +1,238 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniCalculateTmpR1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpR2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpR3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpR4th_4x4; +_viv_uniform VXC_512Bits uniCalculateR1st_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpG1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpG2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpG3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpG4th_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; + +_viv_uniform VXC_512Bits uniCalculateG1st_4x4; +_viv_uniform VXC_512Bits uniCalculateG2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateG3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateG4th_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpB1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpB2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4; +_viv_uniform VXC_512Bits uniCalculateB1st_4x4; + +_viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8HiG_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8LoR_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8HiR_2x8; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; +_viv_uniform float output_zp; +_viv_uniform float output_scale; + +#define YUV420_COPY_SH_IMPL(name, dst_type) \ +__kernel void pre_process_yuv420_copy_##name \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t u_img, \ + __read_only image2d_array_t v_img, \ + __write_only image2d_array_t output, \ + global int * xRatio, \ + global int * yRatio, \ + global int * xOffset, \ + global int * yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \ + int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); \ + vxc_uchar16 Y; \ + vxc_uchar8 U, V; \ + vxc_int4 C0, C1, C2, C3; \ + vxc_uchar16 R, G, B; \ + dst_type dst0, dst1, dst2; \ + \ + VXC_ReadImage(Y, y_img, pos.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + /*C = Y - 16;*/ \ + /*D = U - 128;*/ \ + /*E = V - 128;*/ \ + /* calculate R*/ \ + /* ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]*/ \ + int tmpV = -56992; \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniCalculateTmpR1st_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); \ + \ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + \ + /* calculate G*/ \ + /* ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]*/ \ + /* 298Y - 208V*/ \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); \ + /* 34784 - 100U*/ \ + ushort tmpG = 34784; \ + vxc_ushort8 tmpDstG; \ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); \ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); \ + VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); \ + VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); \ + \ + /* calculate B*/ \ + /* ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]*/ \ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); \ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); \ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); \ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); \ + tmpV = -70688; \ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + \ + var *= output_scale; \ + float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \ + rMean * var - output_zp, var); \ + half4 paramData_f16; \ + _viv_asm(CONV, paramData_f16, paramData); \ + \ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \ + \ + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \ + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \ + \ + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \ + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, 
VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \ + \ + pos = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + pos.z = bOrder; \ + VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + pos.z = 1; \ + VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + pos.z = rOrder; \ + VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ +} +YUV420_COPY_SH_IMPL(U8toU8, vxc_uchar16) +YUV420_COPY_SH_IMPL(U8toI8, vxc_char16) + +#define YUV420_COPY_16BITS_SH_IMPL(name, dst_type) \ +__kernel void pre_process_yuv420_copy_##name \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t u_img, \ + __read_only image2d_array_t v_img, \ + __write_only image2d_array_t output, \ + global int * xRatio, \ + global int * yRatio, \ + global int * xOffset, \ + global int * yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \ + int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); \ + vxc_uchar16 Y; \ + vxc_uchar8 U, V; \ + vxc_int4 C0, C1, C2, C3; \ + vxc_uchar16 R, G, B; \ + dst_type dst0, dst1, dst2, dst3, dst4, dst5; \ + vxc_short8 out0, out1, out2, out3, out4, out5; \ + \ + VXC_ReadImage(Y, y_img, pos.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int tmpV = -56992; \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); \ + \ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); \ + \ + ushort tmpG = 34784; \ + vxc_ushort8 tmpDstG; \ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); \ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); \ + VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); \ + VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); \ + \ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); \ + VXC_DP4x4(C1, Y, U, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); \ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); \ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); \ + tmpV = -70688; \ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + \ + var *= output_scale; \ + float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \ + rMean * var - output_zp, var); \ + half4 paramData_f16; \ + _viv_asm(CONV, paramData_f16, paramData); \ + \ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \ + VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \ + \ + VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \ + VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \ + \ + VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \ + VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \ + \ + _viv_asm(COPY, out0, dst0, 16); \ + _viv_asm(COPY, out1, dst1, 16); \ + _viv_asm(COPY, out2, dst2, 16); \ + _viv_asm(COPY, out3, dst3, 16); \ + _viv_asm(COPY, out4, dst4, 16); \ + _viv_asm(COPY, out5, dst5, 16); \ + \ + pos = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8); \ + VXC_WriteImage2DArray(output, pos.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage2DArray(output, pos.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + pos.z = 1; \ + VXC_WriteImage2DArray(output, pos.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage2DArray(output, pos.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + pos.z = rOrder; \ + VXC_WriteImage2DArray(output, pos.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage2DArray(output, pos.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +YUV420_COPY_16BITS_SH_IMPL(U8toF16, vxc_half8) +YUV420_COPY_16BITS_SH_IMPL(U8toI16, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx deleted file mode 100644 index bce976c..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx +++ /dev/null @@ -1,240 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniCalculateTmpR1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpR2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpR3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpR4th_4x4; -_viv_uniform VXC_512Bits uniCalculateR1st_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpG1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpG2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpG3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpG4th_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; - -_viv_uniform VXC_512Bits uniCalculateG1st_4x4; -_viv_uniform VXC_512Bits uniCalculateG2nd_4x4; -_viv_uniform 
VXC_512Bits uniCalculateG3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateG4th_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpB1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpB2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4; -_viv_uniform VXC_512Bits uniCalculateB1st_4x4; - -_viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8; -_viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8; -_viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8; -_viv_uniform VXC_512Bits uniQuantU8toU8HiG_2x8; -_viv_uniform VXC_512Bits uniQuantU8toU8LoR_2x8; -_viv_uniform VXC_512Bits uniQuantU8toU8HiR_2x8; - -_viv_uniform int bOrder; -_viv_uniform int rOrder; -_viv_uniform int zp; -_viv_uniform float outputScale; - -__kernel void pre_process_yuv420_copy_U8toU8( - __read_only image2d_t y_img, - __read_only image2d_t u_img, - __read_only image2d_t v_img, - __write_only image2d_array_t output, - global int * xRatio, - global int * yRatio, - global int * xOffset, - global int * yOffset, - float rMean, - float gMean, - float bMean, - float var, - int reverse_channel, - int trans - ) -{ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); - int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); - vxc_uchar16 Y; - vxc_uchar8 U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - vxc_uchar16 dst0, dst1, dst2; - - VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - //C = Y - 16; - //D = U - 128; - //E = V - 128; - // calculate R - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); - - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - // calculate G - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG; - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); - VXC_DP4x4(G, C2, tmpDstG, 
VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); - VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); - - // calculate B - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); - tmpV = -70688; - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - var *= outputScale; - float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\ - rMean * var - zp, var); - half4 paramData_f16; - _viv_asm(CONV, paramData_f16, paramData); - - VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); - VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); - - VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); - VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); - - VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); - VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); - - pos = (int4)(get_global_id(0), get_global_id(1), 0, 0); - pos.z = bOrder; - VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - pos.z = 1; - VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - pos.z = rOrder; - VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pre_process_yuv420_copy_U8toF16( - __read_only image2d_t y_img, - __read_only image2d_t u_img, - __read_only image2d_t v_img, - __write_only image2d_array_t output, - global int * xRatio, - global int * yRatio, - global int * xOffset, - global int * yOffset, - float rMean, - float gMean, - float bMean, - float var, - int reverse_channel, - int trans - ) -{ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); - int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); - vxc_uchar16 Y; - vxc_uchar8 U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - vxc_half8 dst0, dst1, dst2, dst3, dst4, dst5; - vxc_short8 out0, out1, out2, out3, out4, out5; - - VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - //C = Y - 16; - //D = U - 128; - //E = V - 128; - // calculate R - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 
0), uniCalculateTmpR1st_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); - - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - // calculate G - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG; - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); - VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); - VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); - - // calculate B - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); - tmpV = -70688; - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - float4 paramData = (float4)(bMean * var, gMean * var,\ - rMean * var, var); - half4 paramData_f16; - _viv_asm(CONV, paramData_f16, paramData); - - VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); - VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); - - VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); - VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); - - VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); - VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); - - _viv_asm(COPY, out0, dst0, 16); - _viv_asm(COPY, out1, dst1, 16); - _viv_asm(COPY, out2, dst2, 16); - _viv_asm(COPY, out3, 
dst3, 16); - _viv_asm(COPY, out4, dst4, 16); - _viv_asm(COPY, out5, dst5, 16); - - pos = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8); - VXC_WriteImage2DArray(output, pos.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, pos.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - pos.z = 1; - VXC_WriteImage2DArray(output, pos.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, pos.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - pos.z = rOrder; - VXC_WriteImage2DArray(output, pos.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, pos.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx new file mode 100644 index 0000000..40db137 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx @@ -0,0 +1,237 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniCalculateR1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; + +_viv_uniform VXC_512Bits uniCalculateB1st_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniDescaleU8_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; + +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; +_viv_uniform float output_zp; +_viv_uniform float output_scale; + +#define YUV420_SCALE_8BITS_SH_IMPL(name, dst_type) \ +__kernel void pre_process_yuv420_scale_##name \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t u_img, \ + __read_only image2d_array_t v_img, \ + __write_only image2d_array_t output, \ + global int * xRatio, \ + global int * yRatio, \ + global int * xOffset, \ + global int * yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + int4 gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + gidx += (int4)(0, 1, 2, 3); \ + \ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \ + int4 sx = fx & 0xffff8000; \ + int fy, sy; \ + fx -= sx; \ + sx = sx >> 15; \ + fx = (fx +(1 << 4)) >> 5; \ + \ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \ + sy = fy & 0xffff8000; \ + fy -= sy; \ + sy = sy >> 15; \ + \ + sy = sy < 0 ? 0 : sy; \ + fy = fy < 0 ? 
0 : fy; \ + \ + fy = (fy + (1<< 4)) >> 5; \ + sx += (*xOffset); \ + sy += (*yOffset); \ + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); \ + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); \ + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); \ + \ + vxc_uchar16 Y, U, V; \ + vxc_int4 C0, C1, C2, C3; \ + vxc_uchar16 R, G, B; \ + \ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + srcPos1.x = (sx.x + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + srcPos2.x = (sx.x + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.y; \ + srcPos1.x = sx.y >> 1; \ + srcPos2.x = sx.y >> 1; \ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \ + srcPos1.x = (sx.y + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \ + srcPos2.x = (sx.y + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.z; \ + srcPos1.x = sx.z >> 1; \ + srcPos2.x = sx.z >> 1; \ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \ + srcPos1.x = (sx.z + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \ + srcPos2.x = (sx.z + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.w; \ + srcPos1.x = sx.w >> 1; \ + srcPos2.x = sx.w >> 1; \ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 
0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \ + srcPos1.x = (sx.w + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \ + srcPos2.x = (sx.w + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + int tmpV = -56992; \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \ + \ + ushort tmpG = 34784; \ + vxc_ushort8 tmpDstG, tmpDstG1; \ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \ + \ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \ + tmpV = -70688; \ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, 
VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + \ + int4 result, temp1, temp2; \ + int4 tmpData0, tmpData1; \ + \ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + \ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + \ + tmpV = 1 << 19; \ + dst_type dst; \ + float4 tmpDst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - bMean) * var; \ + dstPos.z = bOrder; \ + result = convert_int4_rte(tmpDst * output_scale + output_zp); \ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - gMean) * var; \ + dstPos.z = 1; \ + result = convert_int4_rte(tmpDst * output_scale + output_zp); \ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - rMean) * var; \ + dstPos.z = rOrder; \ + result = convert_int4_rte(tmpDst * output_scale + output_zp); \ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +YUV420_SCALE_8BITS_SH_IMPL(U8toU8, vxc_uchar8) +YUV420_SCALE_8BITS_SH_IMPL(U8toI8, vxc_char8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx new file mode 100644 index 0000000..7bfa6d1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx @@ -0,0 +1,245 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniCalculateR1st_4x4; 
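For reference, the folded fixed-point constants that recur in the kernel comments above (-56992, +34784, -70688) come from the BT.601 video-range conversion with C = Y - 16, D = U - 128, E = V - 128: expanding 298*C + 409*E + 128, 298*C - 100*D - 208*E + 128 and 298*C + 516*D + 128 and collecting the constant terms yields exactly those offsets. Below is a minimal scalar sketch of the same arithmetic, for illustration only; it is not part of this patch, and the clamp_u8 helper is a hypothetical stand-in for the saturation that the kernels get from the VXC_MODIFIER saturate flag.

#include <stdint.h>

/* Illustrative scalar version of the BT.601 integer YUV->RGB math used by
 * the pre_process_* kernels. clamp_u8 is a hypothetical helper standing in
 * for the hardware saturation; it is not part of the patch. */
static uint8_t clamp_u8(int v)
{
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void yuv_to_rgb_bt601(uint8_t y, uint8_t u, uint8_t v,
                             uint8_t *r, uint8_t *g, uint8_t *b)
{
    /* R = (298*C + 409*E + 128) >> 8         == (298*Y + 409*V - 56992) >> 8 */
    /* G = (298*C - 100*D - 208*E + 128) >> 8 == (298*Y - 100*U - 208*V + 34784) >> 8 */
    /* B = (298*C + 516*D + 128) >> 8         == (298*Y + 516*U - 70688) >> 8 */
    *r = clamp_u8((298 * y + 409 * v - 56992) >> 8);
    *g = clamp_u8((298 * y - 100 * u - 208 * v + 34784) >> 8);
    *b = clamp_u8((298 * y + 516 * u - 70688) >> 8);
}

In the kernels these three expressions are evaluated four pixels at a time via the uniCalculateTmp*/uniCalculate* dot-product tables, and the mean/var/scale/zero-point normalization is applied afterwards.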
+_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; + +_viv_uniform VXC_512Bits uniCalculateB1st_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniDescaleU8_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; + +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define YUV420_SCALE_16BITS_SH_IMPL(name, dst_type, conv_type) \ +__kernel void pre_process_yuv420_scale_##name \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t u_img, \ + __read_only image2d_array_t v_img, \ + __write_only image2d_array_t output, \ + global int * xRatio, \ + global int * yRatio, \ + global int * xOffset, \ + global int * yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + int4 gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + gidx += (int4)(0, 1, 2, 3); \ + \ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \ + int4 sx = fx & 0xffff8000; \ + int fy, sy; \ + fx -= sx; \ + sx = sx >> 15; \ + fx = (fx +(1 << 4)) >> 5; \ + \ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \ + sy = fy & 0xffff8000; \ + fy -= sy; \ + sy = sy >> 15; \ + \ + sy = sy < 0 ? 0 : sy; \ + fy = fy < 0 ? 
0 : fy; \ + \ + fy = (fy + (1<< 4)) >> 5; \ + sx += (*xOffset); \ + sy += (*yOffset); \ + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); \ + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); \ + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); \ + \ + vxc_uchar16 Y, U, V; \ + vxc_int4 C0, C1, C2, C3; \ + vxc_uchar16 R, G, B; \ + \ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + srcPos1.x = (sx.x + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + srcPos2.x = (sx.x + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.y; \ + srcPos1.x = sx.y >> 1; \ + srcPos2.x = sx.y >> 1; \ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \ + srcPos1.x = (sx.y + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \ + srcPos2.x = (sx.y + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.z; \ + srcPos1.x = sx.z >> 1; \ + srcPos2.x = sx.z >> 1; \ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \ + srcPos1.x = (sx.z + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \ + srcPos2.x = (sx.z + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.w; \ + srcPos1.x = sx.w >> 1; \ + srcPos2.x = sx.w >> 1; \ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 
0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \ + srcPos1.x = (sx.w + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \ + srcPos2.x = (sx.w + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + int tmpV = -56992; \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \ + \ + ushort tmpG = 34784; \ + vxc_ushort8 tmpDstG, tmpDstG1; \ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \ + \ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \ + tmpV = -70688; \ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, 
VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + \ + int4 result, temp1, temp2; \ + int4 tmpData0, tmpData1; \ + dst_type tmpResult; \ + conv_type tmpVal; \ + \ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + \ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + \ + tmpV = 1 << 19; \ + vxc_short8 dst; \ + float4 tmpDst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - bMean) * var; \ + dstPos.z = bOrder; \ + tmpDst = tmpDst * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal, tmpDst); \ + VXC_DP2x8(tmpResult, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmpResult, 8); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - gMean) * var; \ + dstPos.z = 1; \ + tmpDst = tmpDst * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal, tmpDst); \ + VXC_DP2x8(tmpResult, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmpResult, 8); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - rMean) * var; \ + dstPos.z = rOrder; \ + tmpDst = tmpDst * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal, tmpDst); \ + VXC_DP2x8(tmpResult, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmpResult, 8); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +YUV420_SCALE_16BITS_SH_IMPL(U8toF16, vxc_half8, half4) +YUV420_SCALE_16BITS_SH_IMPL(U8toI16, vxc_short8, int4) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_fp16.vx 
b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_fp16.vx deleted file mode 100644 index 9d4e331..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_fp16.vx +++ /dev/null @@ -1,232 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniCalculateR1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; - -_viv_uniform VXC_512Bits uniCalculateB1st_4x4; -_viv_uniform VXC_512Bits uniDescaleU8_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; - -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; - -_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; - -_viv_uniform int bOrder; -_viv_uniform int rOrder; - -__kernel void pre_process_yuv420_scale_U8toF16( - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img, - __read_only image2d_array_t v_img, __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - int4 gidx = get_global_id(0); - int gidy = get_global_id(1); - gidx += (int4)(0, 1, 2, 3); - - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); - int4 sx = fx & 0xffff8000; // Floor - int fy, sy; - fx -= sx; - sx = sx >> 15; - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 
0 : fy; - - fy = (fy + (1<< 4)) >> 5; - sx += (*xOffset); - sy += (*yOffset); - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); - - vxc_uchar16 Y, U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.y; - srcPos1.x = sx.y >> 1; - srcPos2.x = sx.y >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.z; - srcPos1.x = sx.z >> 1; - srcPos2.x = sx.z >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.w; - srcPos1.x = sx.w >> 1; - srcPos2.x = sx.w >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - - //C = Y - 16; D = U - 128; E = V - 128; - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG, tmpDstG1; - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C3, tmpDstG1, 
VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); - tmpV = -70688; - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - int4 result, temp1, temp2; - int4 tmpData0, tmpData1; - - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - // temp2 - temp1 - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - - vxc_half8 tmpVal; - half4 hDst; - tmpV = 1 << 19; - vxc_short8 dst; - float4 tmpDst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - bMean) * var; - dstPos.z = bOrder; - _viv_asm(CONV, hDst, tmpDst); - VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpVal, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - gMean) * var; - dstPos.z = 1; - _viv_asm(CONV, hDst, tmpDst); - VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpVal, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, 
VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - rMean) * var; - dstPos.z = rOrder; - _viv_asm(CONV, hDst, tmpDst); - VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpVal, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i16.vx deleted file mode 100644 index 8bc4c0b..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i16.vx +++ /dev/null @@ -1,227 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniCalculateR1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; - -_viv_uniform VXC_512Bits uniCalculateB1st_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniDescaleU8_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; - -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; - -_viv_uniform int bOrder; -_viv_uniform int rOrder; -_viv_uniform float outputScale; - -__kernel void pre_process_yuv420_scale_U8toI16( - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img, - __read_only image2d_array_t v_img, __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - int4 gidx = get_global_id(0); - int gidy = get_global_id(1); - gidx += (int4)(0, 1, 2, 3); - - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); - int4 sx = fx & 0xffff8000; // Floor - int fy, sy; - fx -= sx; - sx = sx >> 15; - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 
0 : fy; - - fy = (fy + (1<< 4)) >> 5; - sx += (*xOffset); - sy += (*yOffset); - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); - - vxc_uchar16 Y, U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.y; - srcPos1.x = sx.y >> 1; - srcPos2.x = sx.y >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.z; - srcPos1.x = sx.z >> 1; - srcPos2.x = sx.z >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.w; - srcPos1.x = sx.w >> 1; - srcPos2.x = sx.w >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - - //C = Y - 16; D = U - 128; E = V - 128; - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG, tmpDstG1; - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C3, tmpDstG1, 
VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); - tmpV = -70688; - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - int4 result, temp1, temp2; - int4 tmpData0, tmpData1; - - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - // temp2 - temp1 - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - - tmpV = 1 << 19; - vxc_short8 dst; - float4 tmpDst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - bMean) * var; - dstPos.z = bOrder; - result = convert_int4_rte(tmpDst * outputScale); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - gMean) * var; - dstPos.z = 1; - result = convert_int4_rte(tmpDst * outputScale); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = 
(tmpDst - rMean) * var; - dstPos.z = rOrder; - result = convert_int4_rte(tmpDst * outputScale); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i8.vx deleted file mode 100644 index d3150b0..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i8.vx +++ /dev/null @@ -1,227 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniCalculateR1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; - -_viv_uniform VXC_512Bits uniCalculateB1st_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniDescaleU8_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; - -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; - -_viv_uniform int bOrder; -_viv_uniform int rOrder; -_viv_uniform float outputScale; - -__kernel void pre_process_yuv420_scale_U8toI8( - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img, - __read_only image2d_array_t v_img, __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - int4 gidx = get_global_id(0); - int gidy = get_global_id(1); - gidx += (int4)(0, 1, 2, 3); - - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); - int4 sx = fx & 0xffff8000; // Floor - int fy, sy; - fx -= sx; - sx = sx >> 15; - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 
0 : fy; - - fy = (fy + (1<< 4)) >> 5; - sx += (*xOffset); - sy += (*yOffset); - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); - - vxc_uchar16 Y, U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.y; - srcPos1.x = sx.y >> 1; - srcPos2.x = sx.y >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.z; - srcPos1.x = sx.z >> 1; - srcPos2.x = sx.z >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.w; - srcPos1.x = sx.w >> 1; - srcPos2.x = sx.w >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - - //C = Y - 16; D = U - 128; E = V - 128; - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG, tmpDstG1; - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C3, tmpDstG1, 
VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); - tmpV = -70688; - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - int4 result, temp1, temp2; - int4 tmpData0, tmpData1; - - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - // temp2 - temp1 - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - - tmpV = 1 << 19; - vxc_char8 dst; - float4 tmpDst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - bMean) * var; - dstPos.z = bOrder; - result = convert_int4_rte(tmpDst * outputScale); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - gMean) * var; - dstPos.z = 1; - result = convert_int4_rte(tmpDst * outputScale); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = 
(tmpDst - rMean) * var; - dstPos.z = rOrder; - result = convert_int4_rte(tmpDst * outputScale); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_u8.vx deleted file mode 100644 index 6a0340b..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_u8.vx +++ /dev/null @@ -1,228 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniCalculateR1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; - -_viv_uniform VXC_512Bits uniCalculateB1st_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniDescaleU8_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; - -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; - -_viv_uniform int bOrder; -_viv_uniform int rOrder; -_viv_uniform int zp; -_viv_uniform float outputScale; - -__kernel void pre_process_yuv420_scale_U8toU8( - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img, - __read_only image2d_array_t v_img, __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - int4 gidx = get_global_id(0); - int gidy = get_global_id(1); - gidx += (int4)(0, 1, 2, 3); - - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); - int4 sx = fx & 0xffff8000; // Floor - int fy, sy; - fx -= sx; - sx = sx >> 15; - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 
0 : fy; - - fy = (fy + (1<< 4)) >> 5; - sx += (*xOffset); - sy += (*yOffset); - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); - - vxc_uchar16 Y, U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.y; - srcPos1.x = sx.y >> 1; - srcPos2.x = sx.y >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.z; - srcPos1.x = sx.z >> 1; - srcPos2.x = sx.z >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.w; - srcPos1.x = sx.w >> 1; - srcPos2.x = sx.w >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - - //C = Y - 16; D = U - 128; E = V - 128; - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG, tmpDstG1; - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C3, tmpDstG1, 
VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); - tmpV = -70688; - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - int4 result, temp1, temp2; - int4 tmpData0, tmpData1; - - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - // temp2 - temp1 - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - - tmpV = 1 << 19; - vxc_uchar8 dst; - float4 tmpDst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - bMean) * var; - dstPos.z = bOrder; - result = convert_int4_rte(tmpDst * outputScale + zp); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - gMean) * var; - dstPos.z = 1; - result = convert_int4_rte(tmpDst * outputScale + zp); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - 
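For reference, the dot-product uniforms in this (removed) kernel encode the standard BT.601 expansion spelled out in the comments above (C = Y - 16, D = U - 128, E = V - 128). A plain-C sketch of those fixed-point formulas, illustrative only and not part of the patch (clamp8 is a hypothetical helper):

#include <stdint.h>

static uint8_t clamp8(int v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

/* Reference-only restatement of the fixed-point YUV -> RGB math the uniforms encode. */
static void yuv_to_rgb_bt601(uint8_t Y, uint8_t U, uint8_t V,
                             uint8_t *r, uint8_t *g, uint8_t *b)
{
    int C = Y - 16, D = U - 128, E = V - 128;
    *r = clamp8((298 * C + 409 * E + 128) >> 8);            /* == (298*Y + 409*V - 56992) >> 8        */
    *g = clamp8((298 * C - 100 * D - 208 * E + 128) >> 8);  /* == (298*Y - 100*U - 208*V + 34784) >> 8 */
    *b = clamp8((298 * C + 516 * D + 128) >> 8);            /* == (298*Y + 516*U - 70688) >> 8        */
}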
tmpDst = (tmpDst - rMean) * var; - dstPos.z = rOrder; - result = convert_int4_rte(tmpDst * outputScale + zp); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx new file mode 100644 index 0000000..f63e65c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx @@ -0,0 +1,88 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int bOrder; +_viv_uniform int rOrder; + +_viv_uniform float outputScaleVar; +_viv_uniform float bMeanScaleVarZp; +_viv_uniform float gMeanScaleVarZp; +_viv_uniform float rMeanScaleVarZp; + +_viv_uniform VXC_512Bits uniConvertYUV422toB_4x4; +_viv_uniform VXC_512Bits uniConvertYUV422toG_4x4; +_viv_uniform VXC_512Bits uniConvertYUV422toR_4x4; + +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8; + +#define YUV422_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ +__kernel void pre_process_yuv422_copy_##name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int* xRatio, \ + global int* yRatio, \ + global int* xOffset, \ + global int* yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans, \ + int yuv422_type \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int sy = gidy + (*yOffset); \ + int sx = gidx + (*xOffset * 2); \ + \ + vxc_uchar8 YUV; \ + vxc_short8 tmpYUV; \ + \ + VXC_ReadImage(YUV, input, (int2)(sx,sy), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + if (yuv422_type == 1) \ + { \ + YUV.s01234567 = YUV.s10325476; \ + } \ +\ + short tmpVal = 128; \ + VXC_DP2x8(tmpYUV, YUV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \ + \ + float4 tmpDstB, tmpDstG, tmpDstR; \ + VXC_DP4x4(tmpDstB, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \ + VXC_DP4x4(tmpDstG, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \ + VXC_DP4x4(tmpDstR, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \ + \ + conv_type result; \ + dst_type dst0; \ + save_type dst; \ + int4 dstPos = (int4)(gidx, gidy, 0, 0); \ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstB); \ + dstPos.z = bOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstG); \ + dstPos.z = 1; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstR); \ + dstPos.z = rOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, 
copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +YUV422_COPY_SH_IMPL(U8toU8, vxc_uchar4, int4, vxc_uchar4, 4) +YUV422_COPY_SH_IMPL(U8toI8, vxc_char4, int4, vxc_char4, 4) +YUV422_COPY_SH_IMPL(U8toI16, vxc_short4, int4, vxc_short4, 8) +YUV422_COPY_SH_IMPL(U8toF16, vxc_half4, half4, vxc_short4, 8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx new file mode 100644 index 0000000..ff85f8e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx @@ -0,0 +1,132 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int bOrder; +_viv_uniform int rOrder; + +_viv_uniform float outputScaleVar; +_viv_uniform float bMeanScaleVarZp; +_viv_uniform float gMeanScaleVarZp; +_viv_uniform float rMeanScaleVarZp; + +_viv_uniform uint xrIntFloat_16; +_viv_uniform uint yrIntFloat_16; + +_viv_uniform VXC_512Bits uniConvertYUV422toB_4x4; +_viv_uniform VXC_512Bits uniConvertYUV422toG_4x4; +_viv_uniform VXC_512Bits uniConvertYUV422toR_4x4; + +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8; + +#define uyvy422 1 + +#define YUV422_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ +__kernel void pre_process_yuv422_scale_##name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int* xRatio, \ + global int* yRatio, \ + global int* xOffset, \ + global int* yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans, \ + int yuv422_type \ + ) \ +{ \ + int4 gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + gidx += (int4)(0, 1, 2, 3); \ + \ + uint dy = (convert_uint(gidy) * yrIntFloat_16) >> 16; \ + uint4 dx = (convert_uint4(gidx) * xrIntFloat_16) >> 16; \ + int sy = convert_int(dy) + (*yOffset); \ + int4 sx = convert_int4(dx)+ (*xOffset * 2); \ + \ + vxc_uchar4 Y; \ + vxc_uchar8 UV; \ + vxc_char8 tmpUV; \ + short tmpVal = 128; \ + int y_offset = 0; \ + int u_offset = 1; \ + int v_offset = 3; \ +\ + if (yuv422_type == uyvy422) \ + { \ + y_offset = 1; \ + u_offset = 0; \ + v_offset = 2; \ + } \ +\ + int4 coord_Y = (int4)(sx.x * 2 + y_offset, sy, 0, 0); \ + int4 coord_U = (int4)((sx.x >> 1) * 4 + u_offset, sy, 0, 0); \ + int4 coord_V = (int4)((sx.x >> 1) * 4 + v_offset, sy, 0, 0); \ +\ + VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_Y.x = sx.y * 2 + y_offset; \ + VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_Y.x = sx.z * 2 + y_offset; \ + VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_Y.x = sx.w * 2 + y_offset; \ + VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + sx = (sx >> 1) * 4 + u_offset; \ + VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_U.x = sx.y; \ + VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_U.x = sx.z; \ + VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_U.x = sx.w; \ + VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ +\ + sx = sx - u_offset + v_offset; \ + VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \ + 
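The coordinate math above assumes the two packed YUV422 byte orders (YUYV: Y0 U Y1 V, UYVY: U Y0 V Y1), so Y sits at x*2 + y_offset and the shared U/V pair at (x>>1)*4 + u_offset / v_offset. A small illustrative helper in plain C, not part of the patch, that mirrors that indexing:

/* Illustrative only: fetch one pixel's Y/U/V bytes from a packed YUV422 row. */
static void yuv422_sample(const unsigned char *row, int x, int is_uyvy,
                          unsigned char *Y, unsigned char *U, unsigned char *V)
{
    int y_off = is_uyvy ? 1 : 0;
    int u_off = is_uyvy ? 0 : 1;
    int v_off = is_uyvy ? 2 : 3;
    int pair  = (x >> 1) * 4;   /* each 4-byte group carries the U/V pair for two pixels */
    *Y = row[x * 2 + y_off];
    *U = row[pair + u_off];
    *V = row[pair + v_off];
}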
coord_V.x = sx.y; \ + VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_V.x = sx.z; \ + VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \ + coord_V.x = sx.w; \ + VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \ + vxc_uchar4 dst_test; \ + VXC_DP2x8(dst_test, dx, dx, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ +\ + float4 tmpDstB, tmpDstG, tmpDstR; \ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \ + \ + conv_type result; \ + dst_type dst0; \ + save_type dst; \ + int4 dstPos = (int4)(gidx.x, gidy, 0, 0); \ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstB); \ + dstPos.z = bOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstG); \ + dstPos.z = 1; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstR); \ + dstPos.z = rOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} + +YUV422_SH_IMPL(U8toU8, vxc_uchar4, int4, vxc_uchar4, 4) +YUV422_SH_IMPL(U8toI8, vxc_char4, int4, vxc_char4, 4) +YUV422_SH_IMPL(U8toI16, vxc_short4, int4, vxc_short4, 8) +YUV422_SH_IMPL(U8toF16, vxc_half4, half4, vxc_short4, 8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/select.vx b/src/tim/vx/internal/src/libnnext/ops/vx/select.vx index ce788a4..5f72ad1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/select.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/select.vx @@ -12,15 +12,15 @@ _viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp vxc_ushort8 mp0, mp1; \ _viv_asm(COPY, mp0, multAndoutZP0, 16); \ _viv_asm(COPY, mp1, multAndoutZP1, 16); \ - read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(src0, input0, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(src1, input1, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniU8MulAndPostShift0_Lo_2x8); \ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniU8MulAndPostShift1_Lo_2x8); \ - read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(value_tmp, condition, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ VXC_DP2x8(value, value_tmp, value_tmp,\ VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \ @@ -60,11 +60,11 @@ SELECT_INT_FUN_2D(I8, I16, I16, vxc_short8) #define SELECT_HALF(read_fun, write_fun) \ vxc_short8 src0, src1, dst, value; \ vxc_char8 value_tmp; \ - read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(src0, input0, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(src1, input1, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(value_tmp, condition, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ VXC_DP2x8(value, value_tmp, value_tmp,\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \ @@ -91,37 +91,36 @@ __kernel void select_I8_F16_F16toF16_2D( SELECT_HALF(VXC_ReadImage, VXC_WriteImage) } -#define SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, read_fun, write_fun) \ - vxc_short8 src0, src1, dst, value; \ - vxc_half8 value0, value1; \ - src0_type r0; \ - src1_type r1; \ +#define SELECT_HYBRID(src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type, read_fun, write_fun) \ + save_type dst, value; \ + save_type dst0, dst1; \ + dst_type value0, value1; \ + src0_type src0; \ + src1_type src1; \ copy0_type v0; \ copy1_type v1; \ vxc_char8 value_tmp; \ vxc_ushort8 mp0, mp1; \ _viv_asm(COPY, mp0, multAndoutZP0, 16); \ _viv_asm(COPY, mp1, multAndoutZP1, 16); \ - read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, v0, src0, 16); \ - read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, v1, src1, 16); \ VXC_DP2x8(value0, v0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniU8MulAndPostShift0_Lo_2x8); \ - _viv_asm(COPY, src0, value0, 16); \ + _viv_asm(COPY, dst0, value0, 16); \ VXC_DP2x8(value1, v1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniU8MulAndPostShift1_Lo_2x8); \ - _viv_asm(COPY, src1, value1, 16); \ - read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ + _viv_asm(COPY, dst1, value1, 16); \ + read_fun(value_tmp, condition, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ VXC_DP2x8(value, value_tmp, value_tmp,\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \ - dst = (value != 0 ? src0 : src1); \ + dst = (value != 0 ? 
dst0 : dst1); \ write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -#define SELECT_HYBRID_TOF16_FUN(name, src0_type, copy0_type, src1_type, copy1_type) \ +#define SELECT_HYBRID_FUN(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type) \ __kernel void select_##name( \ __read_only image2d_array_t condition, \ __read_only image2d_array_t input0, \ @@ -129,44 +128,62 @@ __kernel void select_##name( \ __write_only image2d_array_t output) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ - SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \ + SELECT_HYBRID(src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type,\ VXC_ReadImage2DArray, VXC_WriteImage2DArray) \ } -SELECT_HYBRID_TOF16_FUN(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16) -SELECT_HYBRID_TOF16_FUN(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8) -SELECT_HYBRID_TOF16_FUN(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16) -SELECT_HYBRID_TOF16_FUN(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8) -SELECT_HYBRID_TOF16_FUN(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) -SELECT_HYBRID_TOF16_FUN(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8) +SELECT_HYBRID_FUN(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN(I8_F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar8, vxc_uchar8) +SELECT_HYBRID_FUN(I8_U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8) +SELECT_HYBRID_FUN(I8_F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char8, vxc_char8) +SELECT_HYBRID_FUN(I8_I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char8, vxc_char8) +SELECT_HYBRID_FUN(I8_F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, vxc_short8) +SELECT_HYBRID_FUN(I8_I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +SELECT_HYBRID_FUN(I8_U8_U8toF16, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN(I8_I8_I8toF16, vxc_char8, vxc_char8, vxc_char8, vxc_char8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN(I8_I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8) -#define SELECT_HYBRID_TOF16_FUN_2D(name, src0_type, copy0_type, src1_type, copy1_type) \ -__kernel void select_##name( \ +#define SELECT_HYBRID_FUN_2D(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type) \ +__kernel void select_##name##_2D( \ __read_only image2d_array_t condition, \ __read_only image2d_array_t input0, \ __read_only image2d_array_t input1, \ __write_only image2d_array_t output) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ - SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \ + SELECT_HYBRID(src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type, \ VXC_ReadImage, VXC_WriteImage) 
\ } -SELECT_HYBRID_TOF16_FUN_2D(I8_F16_U8toF16_2D, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16) -SELECT_HYBRID_TOF16_FUN_2D(I8_U8_F16toF16_2D, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8) -SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I8toF16_2D, vxc_short8, vxc_half8, vxc_char16, vxc_char16) -SELECT_HYBRID_TOF16_FUN_2D(I8_I8_F16toF16_2D, vxc_char16, vxc_char16, vxc_short8, vxc_half8) -SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I16toF16_2D, vxc_short8, vxc_half8, vxc_short8, vxc_short8) -SELECT_HYBRID_TOF16_FUN_2D(I8_I16_F16toF16_2D, vxc_short8, vxc_short8, vxc_short8, vxc_half8) +SELECT_HYBRID_FUN_2D(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar8, vxc_uchar8) +SELECT_HYBRID_FUN_2D(I8_U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8) +SELECT_HYBRID_FUN_2D(I8_F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char8, vxc_char8) +SELECT_HYBRID_FUN_2D(I8_I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char8, vxc_char8) +SELECT_HYBRID_FUN_2D(I8_F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_U8_U8toF16, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_I8_I8toF16, vxc_char8, vxc_char8, vxc_char8, vxc_char8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8) #define SELECT_HALF_TO_QINT(read_fun, write_fun, dst_type) \ vxc_short8 src0, src1, tmp_dst, value; \ vxc_half8 data; \ dst_type dst; \ vxc_char8 value_tmp; \ - read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(src0, input0, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(src1, input1, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(value_tmp, condition, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ VXC_DP2x8(value, value_tmp, value_tmp,\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 2aedbce..fe52f46 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -2139,6 +2139,184 @@ BATCH_NORM_SH_IMPL_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char1 BATCH_NORM_SH_IMPL_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ "; /* end of batchnorm_single_f32_vx*/ +static const char bucketize_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits 
uniDataConvert_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataConvert_1_4x4;\n\ +_viv_uniform int boundaries_size_x8;\n\ +_viv_uniform int boundaries_size;\n\ +\n\ +#define BUCKETIZE_16BITS_SH_IMPL(name, copy_type) \\\n\ +__kernel void bucketize_right_##name \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_t output \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + vxc_short8 data0, data1; \\\n\ + copy_type src0, src1, dst0, dst1; \\\n\ + vxc_ushort8 v0, v1, v2, v3, result = 0; \\\n\ + VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, data0, 16); \\\n\ + \\\n\ + for (; coord.z < boundaries_size_x8; ) \\\n\ + { \\\n\ + VXC_ReadImage(data1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, data1.s00000000, 16); \\\n\ + coord.z += 8; \\\n\ + \\\n\ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v0, dst0, 16); \\\n\ + v2 = sub_sat(v0, 0xFFFE); \\\n\ + _viv_asm(COPY, src1, data1.s11111111, 16); \\\n\ + VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v1, dst1, 16); \\\n\ + v3 = sub_sat(v1, 0xFFFE); \\\n\ + \\\n\ + result = result + v2 + v3; \\\n\ + \\\n\ + _viv_asm(COPY, src1, data1.s22222222, 16); \\\n\ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v0, dst0, 16); \\\n\ + v2 = sub_sat(v0, 0xFFFE); \\\n\ + _viv_asm(COPY, src1, data1.s33333333, 16); \\\n\ + VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v1, dst1, 16); \\\n\ + v3 = sub_sat(v1, 0xFFFE); \\\n\ + \\\n\ + result = result + v2 + v3; \\\n\ + \\\n\ + _viv_asm(COPY, src1, data1.s44444444, 16); \\\n\ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v0, dst0, 16); \\\n\ + v2 = sub_sat(v0, 0xFFFE); \\\n\ + _viv_asm(COPY, src1, data1.s55555555, 16); \\\n\ + VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v1, dst1, 16); \\\n\ + v3 = sub_sat(v1, 0xFFFE); \\\n\ + \\\n\ + result = result + v2 + v3; \\\n\ + \\\n\ + _viv_asm(COPY, src1, data1.s66666666, 16); \\\n\ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v0, dst0, 16); \\\n\ + v2 = sub_sat(v0, 0xFFFE); \\\n\ + _viv_asm(COPY, src1, data1.s77777777, 16); \\\n\ + VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v1, dst1, 16); \\\n\ + v3 = sub_sat(v1, 0xFFFE); \\\n\ + \\\n\ + result = result + v2 + v3; \\\n\ + } \\\n\ + \\\n\ + for (; coord.z < boundaries_size; ) \\\n\ + { \\\n\ + VXC_ReadImage(data1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, data1.s00000000, 16); \\\n\ + coord.z ++; \\\n\ + \\\n\ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v0, dst0, 16); \\\n\ + v2 = sub_sat(v0, 0xFFFE); \\\n\ + \\\n\ + result = result + v2; \\\n\ + } \\\n\ + \\\n\ + int4 d0, d1; \\\n\ + VXC_DP4x4(d0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_0_4x4); \\\n\ + VXC_DP4x4(d1, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_1_4x4); \\\n\ + coord.z = coord.x + 4; \\\n\ + \\\n\ + write_imagei(output, coord.xy, d0); \\\n\ + write_imagei(output, coord.zy, d1); \\\n\ +}\n\ 
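If the VXC_Clamp / sub_sat sequence above is read as a per-lane "input >= boundary" test yielding 0 or 1, the whole kernel reduces to counting how many boundary values each element meets, which is what bucketize_right returns. A scalar reference in plain C, assuming that reading and not part of the patch:

/* Scalar sketch of bucketize_right: index = number of boundaries <= x. */
static int bucketize_right_ref(float x, const float *boundaries, int n)
{
    int count = 0;
    for (int i = 0; i < n; ++i)
    {
        count += (x >= boundaries[i]) ? 1 : 0;
    }
    return count;
}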
+BUCKETIZE_16BITS_SH_IMPL(F16_F16toI32_2D, vxc_half8)\n\ +BUCKETIZE_16BITS_SH_IMPL(I16_I16toI32_2D, vxc_short8)\n\ +\n\ +#define BUCKETIZE_8BITS_SH_IMPL(name, src_type) \\\n\ +__kernel void bucketize_right_##name \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_t output \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + src_type src0, src1, src2; \\\n\ + vxc_uchar8 dst0, dst1, result = 0; \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + for (; coord.z < boundaries_size_x8; ) \\\n\ + { \\\n\ + VXC_ReadImage(src1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z += 8; \\\n\ + \\\n\ + VXC_Clamp(src2, src0, src1.s00000000, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst0, src2, 8); \\\n\ + dst0 = sub_sat(dst0, 0xFE); \\\n\ + VXC_Clamp(src2, src0, src1.s11111111, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst1, src2, 8); \\\n\ + dst1 = sub_sat(dst1, 0xFE); \\\n\ + \\\n\ + result = result + dst0 + dst1; \\\n\ + \\\n\ + VXC_Clamp(src2, src0, src1.s22222222, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst0, src2, 8); \\\n\ + dst0 = sub_sat(dst0, 0xFE); \\\n\ + VXC_Clamp(src2, src0, src1.s33333333, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst1, src2, 8); \\\n\ + dst1 = sub_sat(dst1, 0xFE); \\\n\ + \\\n\ + result = result + dst0 + dst1; \\\n\ + \\\n\ + VXC_Clamp(src2, src0, src1.s44444444, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst0, src2, 8); \\\n\ + dst0 = sub_sat(dst0, 0xFE); \\\n\ + VXC_Clamp(src2, src0, src1.s55555555, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst1, src2, 8); \\\n\ + dst1 = sub_sat(dst1, 0xFE); \\\n\ + \\\n\ + result = result + dst0 + dst1; \\\n\ + \\\n\ + VXC_Clamp(src2, src0, src1.s66666666, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst0, src2, 8); \\\n\ + dst0 = sub_sat(dst0, 0xFE); \\\n\ + VXC_Clamp(src2, src0, src1.s77777777, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst1, src2, 8); \\\n\ + dst1 = sub_sat(dst1, 0xFE); \\\n\ + \\\n\ + result = result + dst0 + dst1; \\\n\ + } \\\n\ + \\\n\ + for (; coord.z < boundaries_size; ) \\\n\ + { \\\n\ + VXC_ReadImage(src1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z ++; \\\n\ + \\\n\ + VXC_Clamp(src2, src0, src1.s00000000, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst0, src2, 8); \\\n\ + dst0 = sub_sat(dst0, 0xFE); \\\n\ + \\\n\ + result = result + dst0; \\\n\ + } \\\n\ + \\\n\ + int4 d0, d1; \\\n\ + VXC_DP4x4(d0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_0_4x4); \\\n\ + VXC_DP4x4(d1, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_1_4x4); \\\n\ + coord.z = coord.x + 4; \\\n\ + \\\n\ + write_imagei(output, coord.xy, d0); \\\n\ + write_imagei(output, coord.zy, d1); \\\n\ +}\n\ +BUCKETIZE_8BITS_SH_IMPL(U8_U8toI32_2D, vxc_uchar8)\n\ +BUCKETIZE_8BITS_SH_IMPL(I8_I8toI32_2D, vxc_char8)\n\ +"; /* end of bucketize_vx*/ + static const char cast_vx[] = "\n\ #include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -9461,7 +9639,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, 
norm; \\\n\ half4 tmpVal0, tmpVal1; \\\n\ float alpha = scale_vari; \\\n\ - float alpha = scale_vari * input_scale; \\\n\ + alpha = scale_vari * input_scale; \\\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\ bias_val = bias_val - input_zp * alpha; \\\n\ \\\n\ @@ -11438,20 +11616,14 @@ __kernel void hswish_BF16toBF16_2D(\n\ static const char instance_normalization_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform float inv_multiplier;\n\ -_viv_uniform int group_num;\n\ \n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform VXC_512Bits uniSum_X_X2_16x2;\n\ _viv_uniform float input_scale;\n\ _viv_uniform float input_scale2;\n\ -_viv_uniform float input_zp;\n\ _viv_uniform float sum_x_tail;\n\ _viv_uniform float sum_x2_tail0;\n\ _viv_uniform float sum_x2_tail1;\n\ -_viv_uniform float output_scale;\n\ -_viv_uniform float output_zp;\n\ \n\ _viv_uniform VXC_512Bits uniSumX_16x1;\n\ _viv_uniform VXC_512Bits uniSumX2_16x1;\n\ @@ -11460,7 +11632,7 @@ _viv_uniform VXC_512Bits uniSumX2_16x1;\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + float eps, int height) \\\n\ { \\\n\ int gidx = get_global_id(0) << 4; \\\n\ int lidx = get_local_id(0); \\\n\ @@ -11518,7 +11690,7 @@ INSTANCE_NORM_SUMS_8BITS_IMPL(I8, vxc_char16)\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + float eps, int height) \\\n\ { \\\n\ int gidx = get_global_id(0) << 4; \\\n\ int lidx = get_local_id(0); \\\n\ @@ -11571,18 +11743,62 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums INSTANCE_NORM_SUMS_8BITS_IMPL_2D(U8, vxc_uchar16)\n\ INSTANCE_NORM_SUMS_8BITS_IMPL_2D(I8, vxc_char16)\n\ \n\ +__kernel void instance_norm_means\n\ +(\n\ + __read_only image2d_t sums,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __write_only image2d_t means,\n\ + float eps,\n\ + float in_time_out_scale,\n\ + float input_zp,\n\ + float output_scale,\n\ + float output_zp,\n\ + float inv_multiplier,\n\ + int group_num\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + Image sums_img = create_image_from_image2d(sums, 4);\n\ + float4 *sums_ptr = (float4 *)get_image_ptr_from_coord(sums_img, coord);\n\ +\n\ + float alpha = read_imagef(scale, coord).x;\n\ + float beta = read_imagef(bias, coord).x;\n\ +\n\ + float4 mean_var = sums_ptr[0];\n\ + for(int i = 1; i < group_num;)\n\ + {\n\ + mean_var += sums_ptr[i];\n\ + i ++;\n\ + }\n\ +\n\ + mean_var *= inv_multiplier;\n\ + mean_var.s1 = mean_var.s1 - mean_var.s0 * mean_var.s0 + eps;\n\ + mean_var.s1 = rsqrt(mean_var.s1);\n\ +\n\ + alpha = alpha * mean_var.y;\n\ +\n\ + float4 dst;\n\ + dst.x = in_time_out_scale * alpha;\n\ + beta = (beta - alpha * mean_var.x) * output_scale + output_zp;\n\ + dst.y = beta - input_zp * dst.x;\n\ +\n\ + Image means_img = create_image_from_image2d(means, 4);\n\ + float4 *means_ptr = (float4 *)get_image_ptr_from_coord(means_img, coord);\n\ + means_ptr[0] = dst.xyxy;\n\ +}\n\ +\n\ _viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ _viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ _viv_uniform VXC_512Bits uniDataToFP32_2_4x4;\n\ _viv_uniform VXC_512Bits uniDataToFP32_3_4x4;\n\ #define 
INSTANCE_NORM_8BITS_IMPL(name, src_type, dst_type) \\\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \\\n\ +__kernel void instance_norm_##name( \\\n\ __read_only image2d_array_t input, \\\n\ - __read_only image2d_t bias, \\\n\ - __read_only image2d_t scale, \\\n\ - __read_only image2d_t meanVari, \\\n\ + __read_only image2d_t means, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + int height) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ @@ -11590,26 +11806,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na int2 coord_para = (int2)(0, gidz); \\\n\ src_type src0; \\\n\ dst_type dst; \\\n\ - float scale_vari, bias_val; \\\n\ - float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + float4 coef; \\\n\ \\\n\ - scale_f = read_imagef(scale, coord_para); \\\n\ - bias_f = read_imagef(bias, coord_para); \\\n\ - for(int i = 0; i < group_num; i++) \\\n\ - { \\\n\ - mean_vari += read_imagef(meanVari, coord_para); \\\n\ - coord_para.x += 4; \\\n\ - } \\\n\ - mean_vari *= inv_multiplier; \\\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ - mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ - \\\n\ - scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ - vxc_int4 tmpVal0, tmpVal1; \\\n\ + coef = read_imagef(means, coord_para); \\\n\ + int4 tmpVal0, tmpVal1; \\\n\ float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ - float alpha = input_scale * output_scale * scale_vari; \\\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ - bias_val = bias_val - input_zp * alpha; \\\n\ \\\n\ int8 input_desc, output_desc; \\\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ @@ -11628,14 +11829,14 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ - norm = tmpData0 * alpha + bias_val; \\\n\ + norm = tmpData0 * coef.x + coef.y; \\\n\ tmpVal0 = convert_int4_rte(norm); \\\n\ - norm = tmpData1 * alpha + bias_val; \\\n\ + norm = tmpData1 * coef.x + coef.y; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ - norm = tmpData2 * alpha + bias_val; \\\n\ + norm = tmpData2 * coef.x + coef.y; \\\n\ tmpVal0 = convert_int4_rte(norm); \\\n\ - norm = tmpData3 * alpha + bias_val; \\\n\ + norm = tmpData3 * coef.x + coef.y; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ VXC_OP4_NoDest(img_store_3d, output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ @@ -11645,60 +11846,46 @@ INSTANCE_NORM_8BITS_IMPL(U8_F32toU8, vxc_uchar16, vxc_uchar16)\n\ INSTANCE_NORM_8BITS_IMPL(I8_F32toI8, vxc_char16, vxc_char16)\n\ \n\ #define INSTANCE_NORM_8BITS_IMPL_2D(name, src_type, dst_type) \\\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \\\n\ +__kernel void instance_norm_##name##_2D( \\\n\ __read_only image2d_array_t input, \\\n\ - __read_only image2d_t bias, \\\n\ - __read_only image2d_t scale, \\\n\ - 
__read_only image2d_t meanVari, \\\n\ + __read_only image2d_t means, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + int height) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int gidy = gidz * height; \\\n\ - int2 coord = (int2)(get_global_id(0), gidy); \\\n\ + int4 coord; \\\n\ int2 coord_para = (int2)(0, gidz); \\\n\ int endH = gidy + height; \\\n\ src_type src0; \\\n\ dst_type dst; \\\n\ - float scale_vari, bias_val; \\\n\ - float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + float4 coef; \\\n\ \\\n\ - scale_f = read_imagef(scale, coord_para); \\\n\ - bias_f = read_imagef(bias, coord_para); \\\n\ - for(int i = 0; i < group_num; i++) \\\n\ - { \\\n\ - mean_vari += read_imagef(meanVari, coord_para); \\\n\ - coord_para.x += 4; \\\n\ - } \\\n\ - mean_vari *= inv_multiplier; \\\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ - mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ - \\\n\ - scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ - vxc_int4 tmpVal0, tmpVal1; \\\n\ + coef = read_imagef(means, coord_para); \\\n\ + int4 tmpVal0, tmpVal1; \\\n\ float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ - float alpha = input_scale * output_scale * scale_vari; \\\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ - bias_val = bias_val - input_zp * alpha; \\\n\ \\\n\ - for(; coord.y < endH; coord.y++) \\\n\ + coord = (int4)(get_global_id(0), gidy, gidy - 1, gidy - 1); \\\n\ + \\\n\ + for(; coord.y < endH; ) \\\n\ { \\\n\ - VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.yz++; \\\n\ VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ - norm = tmpData0 * alpha + bias_val; \\\n\ + norm = tmpData0 * coef.x + coef.y; \\\n\ tmpVal0 = convert_int4_rte(norm); \\\n\ - norm = tmpData1 * alpha + bias_val; \\\n\ + norm = tmpData1 * coef.x + coef.y; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ - norm = tmpData2 * alpha + bias_val; \\\n\ + norm = tmpData2 * coef.x + coef.y; \\\n\ tmpVal0 = convert_int4_rte(norm); \\\n\ - norm = tmpData3 * alpha + bias_val; \\\n\ + norm = tmpData3 * coef.x + coef.y; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord.xz, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ } \\\n\ }\n\ INSTANCE_NORM_8BITS_IMPL_2D(U8_F32toU8, vxc_uchar16, vxc_uchar16)\n\ @@ -11706,12 +11893,6 @@ INSTANCE_NORM_8BITS_IMPL_2D(I8_F32toI8, vxc_char16, vxc_char16)"; /* end of ins static const char instance_normalization_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -_viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform float inv_multiplier;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform float input_zp;\n\ _viv_uniform 
VXC_512Bits uniExtract8Data_2x8;\n\ \n\ _viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ @@ -11720,13 +11901,11 @@ _viv_uniform VXC_512Bits uniDataToFP32_2_4x4;\n\ _viv_uniform VXC_512Bits uniDataToFP32_3_4x4;\n\ \n\ #define INSTANCE_NORM_8_TO_F16_IMPL(name, src_type) \\\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \\\n\ +__kernel void instance_norm_##name( \\\n\ __read_only image2d_array_t input, \\\n\ - __read_only image2d_t bias, \\\n\ - __read_only image2d_t scale, \\\n\ - __read_only image2d_t meanVari, \\\n\ + __read_only image2d_t means, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + int height) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ @@ -11734,25 +11913,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na src_type src0; \\\n\ vxc_short8 outval; \\\n\ vxc_half8 dst; \\\n\ - float scale_vari, bias_val; \\\n\ - float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + float4 coef; \\\n\ \\\n\ - scale_f = read_imagef(scale, coord_para.xy); \\\n\ - bias_f = read_imagef(bias, coord_para.xy); \\\n\ - for(int i = 0; i < group_num; i++) \\\n\ - { \\\n\ - mean_vari += read_imagef(meanVari, coord_para.xy); \\\n\ - coord_para.x += 4; \\\n\ - } \\\n\ - mean_vari *= inv_multiplier; \\\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ - mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ - scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + coef = read_imagef(means, coord_para.xy); \\\n\ float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ half4 tmpVal0, tmpVal1; \\\n\ - float alpha = scale_vari * input_scale; \\\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\ - bias_val = bias_val - input_zp * alpha; \\\n\ \\\n\ coord_para = coord; \\\n\ int8 input_desc, output_desc; \\\n\ @@ -11773,17 +11938,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ - norm = alpha * tmpData0 + bias_val; \\\n\ + norm = tmpData0 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal0, norm); \\\n\ - norm = alpha * tmpData1 + bias_val; \\\n\ + norm = tmpData1 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal1, norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_para.x += 8; \\\n\ - norm = alpha * tmpData2 + bias_val; \\\n\ + norm = tmpData2 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal0, norm); \\\n\ - norm = alpha * tmpData3 + bias_val; \\\n\ + norm = tmpData3 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal1, norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ @@ -11794,13 +11959,11 @@ INSTANCE_NORM_8_TO_F16_IMPL(U8_F32toF16, vxc_uchar16)\n\ INSTANCE_NORM_8_TO_F16_IMPL(I8_F32toF16, vxc_char16)\n\ \n\ #define INSTANCE_NORM_8_TO_F16_IMPL_2D(name, src_type) \\\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \\\n\ 
+__kernel void instance_norm_##name##_2D( \\\n\ __read_only image2d_array_t input, \\\n\ - __read_only image2d_t bias, \\\n\ - __read_only image2d_t scale, \\\n\ - __read_only image2d_t meanVari, \\\n\ + __read_only image2d_t means, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + int height) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int gidy = gidz * height; \\\n\ @@ -11810,26 +11973,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na src_type src0; \\\n\ vxc_short8 outval; \\\n\ vxc_half8 dst; \\\n\ - float scale_vari, bias_val; \\\n\ - float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + float4 coef; \\\n\ \\\n\ - scale_f = read_imagef(scale, coord_para.xy); \\\n\ - bias_f = read_imagef(bias, coord_para.xy); \\\n\ - for(int i = 0; i < group_num; i++) \\\n\ - { \\\n\ - mean_vari += read_imagef(meanVari, coord_para.xy); \\\n\ - coord_para.x += 4; \\\n\ - } \\\n\ - mean_vari *= inv_multiplier; \\\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ - mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ - \\\n\ - scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + coef = read_imagef(means, coord_para.xy); \\\n\ float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ half4 tmpVal0, tmpVal1; \\\n\ - float alpha = scale_vari * input_scale; \\\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\ - bias_val = bias_val - input_zp * alpha; \\\n\ for(; coord.y < endH;) \\\n\ { \\\n\ VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -11839,17 +11987,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ - norm = alpha * tmpData0 + bias_val; \\\n\ + norm = tmpData0 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal0, norm); \\\n\ - norm = alpha * tmpData1 + bias_val; \\\n\ + norm = tmpData1 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal1, norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_para.x += 8; \\\n\ - norm = alpha * tmpData2 + bias_val; \\\n\ + norm = tmpData2 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal0, norm); \\\n\ - norm = alpha * tmpData3 + bias_val; \\\n\ + norm = tmpData3 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal1, norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ @@ -11863,28 +12011,21 @@ INSTANCE_NORM_8_TO_F16_IMPL_2D(I8_F32toF16, vxc_char16)\n\ static const char instance_normalization_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform float inv_multiplier;\n\ -_viv_uniform int group_num;\n\ _viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ _viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform VXC_512Bits uniSum_X_X2_8x2;\n\ _viv_uniform float input_scale;\n\ _viv_uniform float input_scale2;\n\ -_viv_uniform float input_zp;\n\ _viv_uniform float sum_x_tail;\n\ 
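The instance-norm rework above moves the per-group reduction out of every normalize kernel: the new instance_norm_means pass folds mean, variance, eps, the affine scale/bias, and the input/output quantization terms into one per-channel pair coef = (coef.x, coef.y), so the normalize kernels only apply a fused multiply-add, out = x * coef.x + coef.y. A plain-C sketch of that precomputation, not part of the patch; the sums layout and float types are assumptions made for illustration:

#include <math.h>

typedef struct { float x; float y; } norm_coef;

/* Hypothetical restatement of the instance_norm_means pass for one channel.
   sums[i] holds the group-wise (sum of x, sum of x^2) pair. */
static norm_coef instance_norm_coef(const float sums[][2], int group_num,
                                    float inv_multiplier, float eps,
                                    float scale, float bias,
                                    float in_time_out_scale, float input_zp,
                                    float output_scale, float output_zp)
{
    float mean = 0.0f, sqr = 0.0f;
    for (int i = 0; i < group_num; ++i)
    {
        mean += sums[i][0];
        sqr  += sums[i][1];
    }
    mean *= inv_multiplier;
    sqr  *= inv_multiplier;

    float inv_std = 1.0f / sqrtf(sqr - mean * mean + eps);
    float alpha   = scale * inv_std;

    norm_coef coef;
    coef.x = in_time_out_scale * alpha;
    coef.y = (bias - alpha * mean) * output_scale + output_zp - input_zp * coef.x;
    return coef;   /* normalize kernels then do: out = in * coef.x + coef.y */
}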
_viv_uniform float sum_x2_tail0;\n\ _viv_uniform float sum_x2_tail1;\n\ \n\ -_viv_uniform float output_scale;\n\ -_viv_uniform float output_zp;\n\ -\n\ #define INSTANCE_NORM_SUMS_16BITS_IMPL(name, src_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + float eps, int height) \\\n\ { \\\n\ int gidx = get_global_id(0) << 3; \\\n\ int lidx = get_local_id(0); \\\n\ @@ -11949,7 +12090,7 @@ INSTANCE_NORM_SUMS_16BITS_IMPL(I16, vxc_short8)\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + float eps, int height) \\\n\ { \\\n\ int gidx = get_global_id(0) << 3; \\\n\ int lidx = get_local_id(0); \\\n\ @@ -12008,13 +12149,11 @@ INSTANCE_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8)\n\ INSTANCE_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8)\n\ \n\ #define INSTANCE_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \\\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \\\n\ +__kernel void instance_norm_##name( \\\n\ __read_only image2d_array_t input, \\\n\ - __read_only image2d_t bias, \\\n\ - __read_only image2d_t scale, \\\n\ - __read_only image2d_t meanVari, \\\n\ + __read_only image2d_t means, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + int height) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ @@ -12022,28 +12161,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na int4 coord_para = (int4)(0, gidz, 0, 0); \\\n\ vxc_short8 src0; \\\n\ src_type in_h; \\\n\ - float scale_vari, bias_val; \\\n\ - float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + float4 coef; \\\n\ \\\n\ - scale_f = read_imagef(scale, coord_para.xy); \\\n\ - bias_f = read_imagef(bias, coord_para.xy); \\\n\ + coef = read_imagef(means, coord_para.xy); \\\n\ \\\n\ - for(int i = 0; i < group_num; i++) \\\n\ - { \\\n\ - mean_vari += read_imagef(meanVari, coord_para.xy); \\\n\ - coord_para.x += 4; \\\n\ - } \\\n\ - mean_vari *= inv_multiplier; \\\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ - mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ - \\\n\ - scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ - float alpha = input_scale * output_scale * scale_vari; \\\n\ float4 tmpData0, tmpData1; \\\n\ copy_type outval; \\\n\ conv_type tmpVal0, tmpVal1; \\\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ - bias_val = bias_val - input_zp * alpha; \\\n\ dst_type dst; \\\n\ \\\n\ int8 input_desc, output_desc; \\\n\ @@ -12066,9 +12190,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ \\\n\ float4 norm; \\\n\ - norm = alpha * tmpData0 + bias_val; \\\n\ + norm = tmpData0 * coef.x + coef.y; \\\n\ _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ - norm = alpha * tmpData1 + bias_val; \\\n\ + norm = tmpData1 * coef.x + coef.y; \\\n\ _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ @@ -12083,13 +12207,11 @@ 
INSTANCE_NORM_16BITS_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4 INSTANCE_NORM_16BITS_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ \n\ #define INSTANCE_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \\\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \\\n\ +__kernel void instance_norm_##name##_2D( \\\n\ __read_only image2d_array_t input, \\\n\ - __read_only image2d_t bias, \\\n\ - __read_only image2d_t scale, \\\n\ - __read_only image2d_t meanVari, \\\n\ + __read_only image2d_t means, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + int height) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int gidy = gidz * height; \\\n\ @@ -12098,28 +12220,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na int endH = gidy + height; \\\n\ vxc_short8 src0; \\\n\ src_type in_h; \\\n\ - float scale_vari, bias_val; \\\n\ - float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + float4 coef; \\\n\ \\\n\ - scale_f = read_imagef(scale, coord_para.xy); \\\n\ - bias_f = read_imagef(bias, coord_para.xy); \\\n\ + coef = read_imagef(means, coord_para.xy); \\\n\ \\\n\ - for(int i = 0; i < group_num; i++) \\\n\ - { \\\n\ - mean_vari += read_imagef(meanVari, coord_para.xy); \\\n\ - coord_para.x += 4; \\\n\ - } \\\n\ - mean_vari *= inv_multiplier; \\\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ - mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ - \\\n\ - scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ - float alpha = input_scale * output_scale * scale_vari; \\\n\ float4 tmpData0, tmpData1; \\\n\ copy_type outval; \\\n\ conv_type tmpVal0, tmpVal1; \\\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ - bias_val = bias_val - input_zp * alpha; \\\n\ dst_type dst; \\\n\ \\\n\ for(; coord.y < endH; coord.y++) \\\n\ @@ -12130,9 +12237,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ float4 norm; \\\n\ - norm = alpha * tmpData0 + bias_val; \\\n\ + norm = tmpData0 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal0, norm); \\\n\ - norm = alpha * tmpData1 + bias_val; \\\n\ + norm = tmpData1 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal1, norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ @@ -12150,15 +12257,13 @@ INSTANCE_NORM_16BITS_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, h static const char instance_normalization_3_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform float inv_multiplier;\n\ -_viv_uniform int group_num;\n\ +\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16(\n\ - image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ + image2d_array_t input, image2d_array_t output, float eps, int height)\n\ {\n\ int gidx = get_global_id(0) << 3;\n\ int lidx = get_local_id(0);\n\ @@ -12219,7 +12324,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) 
void instance_norm_sums }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16_2D(\n\ - image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ + image2d_array_t input, image2d_array_t output, float eps, int height)\n\ {\n\ int gidx = get_global_id(0) << 3;\n\ int lidx = get_local_id(0);\n\ @@ -12278,36 +12383,21 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums }\n\ }\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16_F32toBF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ +__kernel void instance_norm_BF16_F32toBF16(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t means,\n\ + __write_only image2d_array_t output,\n\ + int height)\n\ {\n\ int gidz = get_global_id(1);\n\ int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ vxc_short8 src0, src1, src2;\n\ - float scale_vari, bias_val;\n\ - float4 mean_vari = (float4)(0);\n\ + float4 coef;\n\ \n\ - Image img3 = create_image_from_image2d(meanVari, 4);\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz));\n\ - __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ + coef = read_imagef(means, coord.yz);\n\ \n\ - float sval = read_imagef(scale, coord.yz).x;\n\ - float bval = read_imagef(bias, coord.yz).x;\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += vari_ptr[i];\n\ - }\n\ -\n\ - mean_vari *= inv_multiplier;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = sval * mean_vari.s1;\n\ float4 tmpData0, tmpData1;\n\ - bias_val = (bval - scale_vari * mean_vari.s0);\n\ \n\ int8 input_desc, output_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ @@ -12320,6 +12410,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_in.y ++;\n\ @@ -12331,9 +12422,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 _viv_asm(COPY, tmpData1, src2, 16);\n\ \n\ float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ + norm = tmpData0 * coef.x + coef.y;\n\ _viv_asm(COPY, src0, norm, 16);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ + norm = tmpData1 * coef.x + coef.y;\n\ _viv_asm(COPY, src1, norm, 16);\n\ VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ VXC_OP4_NoDest(img_store_3d, output, coord, src2, \\\n\ @@ -12341,41 +12432,27 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 }\n\ }\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ +__kernel void instance_norm_BF16_F32toBF16_2D(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t means,\n\ + __write_only image2d_array_t output,\n\ + int height)\n\ {\n\ int gidz = get_global_id(1);\n\ int gidy = gidz * height;\n\ int2 coord = (int2)(get_global_id(0), 
gidy);\n\ - int2 coord_para = (int2)(gidz, 0);\n\ + int2 coord_para = (int2)(0, gidz);\n\ int endH = gidy + height;\n\ vxc_short8 src0, src1, src2;\n\ - float scale_vari, bias_val;\n\ - float4 mean_vari = (float4)(0);\n\ + float4 coef;\n\ \n\ - Image img3 = create_image_from_image2d(meanVari, 4);\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ - __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ + coef = read_imagef(means, coord_para);\n\ \n\ - float sval = read_imagef(scale, coord_para.yx).x;\n\ - float bval = read_imagef(bias, coord_para.yx).x;\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += vari_ptr[i];\n\ - }\n\ -\n\ - mean_vari *= inv_multiplier;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = sval * mean_vari.s1;\n\ float4 tmpData0, tmpData1;\n\ - bias_val = (bval - scale_vari * mean_vari.s0);\n\ \n\ for(; coord.y < endH; coord.y++)\n\ {\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ @@ -12386,9 +12463,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 _viv_asm(COPY, tmpData1, src2, 16);\n\ \n\ float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ + norm = tmpData0 * coef.x + coef.y;\n\ _viv_asm(COPY, src0, norm, 16);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ + norm = tmpData1 * coef.x + coef.y;\n\ _viv_asm(COPY, src1, norm, 16);\n\ VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ @@ -12547,7 +12624,7 @@ _viv_uniform int inputZP;\n\ VXC_Vstore3(dst_ptr, 0, dst.s012); \\\n\ break; \\\n\ case 4: \\\n\ - VXC_Vstore4(dst_ptr, 0, dst.0123); \\\n\ + VXC_Vstore4(dst_ptr, 0, dst.s0123); \\\n\ break; \\\n\ case 5: \\\n\ VXC_Vstore2(dst_ptr, 0, dst.s01); \\\n\ @@ -12562,7 +12639,7 @@ _viv_uniform int inputZP;\n\ VXC_Vstore3(dst_ptr, 0, dst.s012); \\\n\ break; \\\n\ case 7: \\\n\ - VXC_Vstore4(dst_ptr, 0, dst.0123); \\\n\ + VXC_Vstore4(dst_ptr, 0, dst.s0123); \\\n\ dst.s012 = dst.s456; \\\n\ dst_ptr += 4; \\\n\ VXC_Vstore3(dst_ptr, 0, dst.s012); \\\n\ @@ -20912,6 +20989,11 @@ _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ _viv_uniform int ac2zero;\n\ _viv_uniform int bc2zero;\n\ \n\ +_viv_uniform VXC_512Bits uniI16MulI16SumtoI32_16x1;\n\ +_viv_uniform VXC_512Bits uniI16MulI16SumtoI32B_16x1;\n\ +_viv_uniform float inout_beta;\n\ +_viv_uniform float inout_scale;\n\ +\n\ #define GEMM_QINT_TO_QINT(src0_type_name, read_type) \\\n\ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \\\n\ image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ @@ -21004,6 +21086,142 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ GEMM_QINT_TO_QINT(I16, vxc_short8)\n\ +\n\ +__kernel void gemm_transb_I16I16toI16(image2d_array_t inputA,\n\ + image2d_array_t inputB, image2d_array_t output,\n\ + int transposeA, int transposeB, int adjointA, int adjointB,\n\ + uint M, uint K, uint N)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_a = (int4)(0, 
coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0);\n\ +\n\ + vxc_float4 sum0 = (vxc_float4)(0);\n\ + vxc_float4 sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0);\n\ + vxc_float4 sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;)\n\ + {\n\ + vxc_short8 srcA0,srcA1,srcA2,srcA3;\n\ + vxc_short8 srcB0,srcB1,srcB2,srcB3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 8;\n\ + coord_b.x += 8;\n\ +\n\ + vxc_int4 iVal;\n\ + vxc_float4 fpVal;\n\ + VXC_DP16x1(iVal, srcA0, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA0, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA0, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA0, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + sum0 = sum0 + fpVal * inout_scale + inout_beta;\n\ +\n\ + VXC_DP16x1(iVal, srcA1, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA1, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA1, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + 
VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA1, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + sum1 = sum1 + fpVal * inout_scale + inout_beta;\n\ +\n\ + VXC_DP16x1(iVal, srcA2, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA2, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA2, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA2, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + sum2 = sum2 + fpVal * inout_scale + inout_beta;\n\ +\n\ + VXC_DP16x1(iVal, srcA3, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA3, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA3, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA3, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + sum3 = sum3 + fpVal * inout_scale + inout_beta;\n\ + }\n\ + vxc_int4 tmpOut0, tmpOut1;\n\ + vxc_short8 valDst;\n\ + tmpOut0 = convert_int4_rte(sum0);\n\ + tmpOut1 = convert_int4_rte(sum1);\n\ + VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + tmpOut0 = convert_int4_rte(sum2);\n\ + tmpOut1 = convert_int4_rte(sum3);\n\ + VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ "; /* end of matrixmul_i16_vx*/ static const char matrixmul_transA_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -27810,6 +28028,94 @@ PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\ PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\ "; /* end of pre_process_gray_copy_vx*/ 
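For reference, the reworked instance_norm_* kernels earlier in this hunk no longer reduce mean/variance or apply eps/rsqrt per pixel; they read one per-channel coefficient pair from the new "means" image and apply a single fused multiply-add, norm = x * coef.x + coef.y. The short host-side sketch below shows how such a pair could be precomputed from the sums produced by the instance_norm_sums_* pass, assuming coef.x / coef.y are the usual affine form of instance normalization that the removed per-kernel code computed; the type and function names here (in_coef_t, precompute_in_coef) are illustrative only and not part of the library.

#include <math.h>

typedef struct { float alpha; float beta; } in_coef_t;  /* maps to coef.x / coef.y */

/* sum and sqsum are the per-channel accumulations from the sums pass; count is the
 * number of elements per channel (1.0f / count played the role of inv_multiplier). */
static in_coef_t precompute_in_coef(float sum, float sqsum, float count,
                                    float gamma, float bias, float eps)
{
    in_coef_t c;
    float mean = sum / count;
    float var  = sqsum / count - mean * mean;   /* E[x^2] - E[x]^2 */
    c.alpha = gamma / sqrtf(var + eps);         /* coef.x */
    c.beta  = bias - mean * c.alpha;            /* coef.y */
    return c;
}

Quantized variants would additionally fold input_scale/output_scale and the zero points into alpha and beta, as the deleted in-kernel code did before this refactor.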
+static const char pre_process_nv12_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +\n\ +_viv_uniform float outputScaleVar;\n\ +_viv_uniform float bMeanScaleVarZp;\n\ +_viv_uniform float gMeanScaleVarZp;\n\ +_viv_uniform float rMeanScaleVarZp;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;\n\ +\n\ +#define NV12_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ +__kernel void pre_process_nv12_copy_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t uv_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int* xRatio, \\\n\ + global int* yRatio, \\\n\ + global int* xOffset, \\\n\ + global int* yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int sy = gidy + (*yOffset); \\\n\ + int sx = gidx + (*xOffset); \\\n\ + int uvX = sx & 0xfffffffe; \\\n\ + int uvY = sy >> 1; \\\n\ + \\\n\ + vxc_uchar16 Y, UV; \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, (int2)(sx,sy), 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), 0,VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_char16 tmpUV; \\\n\ + short tmpVal = 128; \\\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \\\n\ + \\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \\\n\ + \\\n\ + conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +NV12_COPY_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\ +NV12_COPY_SH_IMPL(U8toI8, vxc_char8, int4, 
vxc_char8, 8)\n\ +NV12_COPY_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ +NV12_COPY_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ +"; /* end of pre_process_nv12_copy_vx*/ + static const char pre_process_nv12_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int bOrder;\n\ @@ -27820,363 +28126,6 @@ _viv_uniform float bMeanScaleVarZp;\n\ _viv_uniform float gMeanScaleVarZp;\n\ _viv_uniform float rMeanScaleVarZp;\n\ \n\ -_viv_uniform uint xrIntFloat_16;\n\ -_viv_uniform uint yrIntFloat_16;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;\n\ -\n\ -__kernel void pre_process_nv12_scale_U8toI16(\n\ - __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - uint4 gidx = get_global_id(0);\n\ - uint gidy = get_global_id(1);\n\ - gidx += (uint4)(0, 1, 2, 3);\n\ -\n\ - uint dy = (gidy * yrIntFloat_16) >> 16;\n\ - uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ - int sy = convert_int(dy) + (*yOffset);\n\ - int4 sx = convert_int4(dx) + (*xOffset);\n\ - int4 uvX = sx & 0xfffffffe;\n\ - int uvY = sy >> 1;\n\ -\n\ - vxc_uchar16 Y, UV;\n\ - int2 coord = (int2)(sx.x, sy);\n\ - int2 coord_uv = (int2)(uvX.x, uvY);\n\ -\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.y;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.z;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.w;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.y;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.z;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.w;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_char16 tmpUV;\n\ - short tmpVal = 128;\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ -\n\ - float4 tmpDstB, tmpDstG, tmpDstR;\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ -\n\ - int4 result;\n\ - vxc_short8 dst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ - dstPos.z = bOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ - dstPos.z = 1;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ - dstPos.z = rOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pre_process_nv12_scale_U8toF16(\n\ - __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - uint4 gidx = get_global_id(0);\n\ - uint gidy = get_global_id(1);\n\ - gidx += (uint4)(0, 1, 2, 3);\n\ -\n\ - uint dy = (gidy * yrIntFloat_16) >> 16;\n\ - uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ - int sy = convert_int(dy) + (*yOffset);\n\ - int4 sx = convert_int4(dx) + (*xOffset);\n\ - int4 uvX = sx & 0xfffffffe;\n\ - int uvY = sy >> 1;\n\ -\n\ - vxc_uchar16 Y, UV;\n\ - int2 coord = (int2)(sx.x, sy);\n\ - int2 coord_uv = (int2)(uvX.x, uvY);\n\ -\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.y;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.z;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.w;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.y;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.z;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.w;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_char16 tmpUV;\n\ - short tmpVal = 128;\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ -\n\ - float4 tmpDstB, tmpDstG, tmpDstR;\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ -\n\ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp;\n\ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp;\n\ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp;\n\ -\n\ - half4 result;\n\ - vxc_half8 tmpdst;\n\ - vxc_short8 dst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - _viv_asm(CONV, result, tmpDstB);\n\ - dstPos.z = bOrder;\n\ - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpdst, 16);\n\ - VXC_WriteImage2DArray(output, 
dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(CONV, result, tmpDstG);\n\ - dstPos.z = 1;\n\ - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpdst, 16);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(CONV, result, tmpDstR);\n\ - dstPos.z = rOrder;\n\ - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpdst, 16);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_nv12_scale_vx*/ - -static const char pre_process_nv12_scale_8bits_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -\n\ -_viv_uniform float outputScaleVar;\n\ -_viv_uniform float bMeanScaleVarZp;\n\ -_viv_uniform float gMeanScaleVarZp;\n\ -_viv_uniform float rMeanScaleVarZp;\n\ -\n\ -_viv_uniform uint xrIntFloat_16;\n\ -_viv_uniform uint yrIntFloat_16;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;\n\ -_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;\n\ -\n\ -__kernel void pre_process_nv12_scale_U8toU8(\n\ - __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - uint4 gidx = get_global_id(0);\n\ - uint gidy = get_global_id(1);\n\ - gidx += (uint4)(0, 1, 2, 3);\n\ -\n\ - uint dy = (gidy * yrIntFloat_16) >> 16;\n\ - uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ - int sy = convert_int(dy) + (*yOffset);\n\ - int4 sx = convert_int4(dx) + (*xOffset);\n\ - int4 uvX = sx & 0xfffffffe;\n\ - int uvY = sy >> 1;\n\ -\n\ - vxc_uchar16 Y, UV;\n\ - int2 coord = (int2)(sx.x, sy);\n\ - int2 coord_uv = (int2)(uvX.x, uvY);\n\ -\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.y;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.z;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.w;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.y;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.z;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.w;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_char16 tmpUV;\n\ - short tmpVal = 128;\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ -\n\ - float4 tmpDstB, tmpDstG, tmpDstR;\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvertNV12toB_4x4);\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ -\n\ - int4 result;\n\ - vxc_uchar8 dst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ - dstPos.z = bOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ - dstPos.z = 1;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ - dstPos.z = rOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pre_process_nv12_copy_U8toU8(\n\ - __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - int gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ -\n\ - int sy = gidy + (*yOffset);\n\ - int sx = gidx + (*xOffset);\n\ - int uvX = sx & 0xfffffffe;\n\ - int uvY = sy >> 1;\n\ -\n\ - vxc_uchar16 Y, UV;\n\ -\n\ - VXC_ReadImage(Y, y_img, (int2)(sx,sy), VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_char16 tmpUV;\n\ - short tmpVal = 128;\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8);\n\ -\n\ - float4 tmpDstB, tmpDstG, tmpDstR;\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ -\n\ - int4 result;\n\ - vxc_uchar8 dst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ - dstPos.z = bOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ - dstPos.z = 1;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ - dstPos.z = rOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pre_process_nv12_scale_U8toI8(\n\ - __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - uint4 gidx = get_global_id(0);\n\ - uint gidy = get_global_id(1);\n\ - gidx += (uint4)(0, 1, 2, 3);\n\ -\n\ - uint dy = (gidy * yrIntFloat_16) >> 16;\n\ - uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ - int sy = convert_int(dy) + (*yOffset);\n\ - int4 sx = convert_int4(dx) + (*xOffset);\n\ - int4 uvX = sx & 0xfffffffe;\n\ - int uvY = sy >> 1;\n\ -\n\ - vxc_uchar16 Y, UV;\n\ - int2 coord = (int2)(sx.x, sy);\n\ - int2 coord_uv = (int2)(uvX.x, uvY);\n\ -\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.y;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.z;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.w;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.y;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.z;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.w;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_char16 tmpUV;\n\ - short tmpVal = 128;\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ -\n\ - float4 tmpDstB, tmpDstG, tmpDstR;\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ -\n\ - int4 result;\n\ - vxc_char8 dst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ - dstPos.z = bOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ - dstPos.z = 1;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ - dstPos.z = rOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_nv12_scale_8bits_vx*/ - -static const char pre_process_nv12_scale_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ 
-_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -\n\ -_viv_uniform float outputScaleVar;\n\ -_viv_uniform float bMeanScaleVarZp;\n\ -_viv_uniform float gMeanScaleVarZp;\n\ -_viv_uniform float rMeanScaleVarZp;\n\ -\n\ _viv_uniform uint xrIntFloat_16;\n\ _viv_uniform uint yrIntFloat_16;\n\ \n\ @@ -28186,149 +28135,190 @@ _viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ \n\ _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ \n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;\n\ \n\ _viv_uniform VXC_512Bits uniCalculateYShift_2x8;\n\ _viv_uniform VXC_512Bits uniCalculateUVShift_2x8;\n\ \n\ -__kernel void pre_process_nv12_scale_U8toU8_gq(\n\ - __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - uint4 gidx = get_global_id(0);\n\ - uint gidy = get_global_id(1);\n\ - gidx += (uint4)(0, 1, 2, 3);\n\ -\n\ - uint dy = (gidy * yrIntFloat_16) >> 16;\n\ - uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ - int sy = convert_int(dy) + (*yOffset);\n\ - int4 sx = convert_int4(dx) + (*xOffset);\n\ - int4 uvX = sx & 0xfffffffe;\n\ - int uvY = sy >> 1;\n\ -\n\ - vxc_uchar16 Y, UV;\n\ - int2 coord = (int2)(sx.x, sy);\n\ - int2 coord_uv = (int2)(uvX.x, uvY);\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ - vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ - int4 offsetUV = uvX - uvX.x;\n\ -\n\ - vxc_ushort8 diffY, diffUV;\n\ - _viv_asm(COPY, diffY, sx, 16);\n\ - _viv_asm(COPY, diffUV, offsetUV, 16);\n\ -\n\ - vxc_ushort8 constData = 8;\n\ - VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniCalculateYShift_2x8);\n\ - VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniCalculateUVShift_2x8);\n\ - VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_char16 tmpUV;\n\ - short tmpVal = 128;\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ -\n\ - float4 tmpDstB, tmpDstG, tmpDstR;\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ -\n\ - int4 result;\n\ - vxc_uchar8 dst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ - dstPos.z = bOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ - dstPos.z = 1;\n\ - VXC_DP2x8(dst, result, result, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ - dstPos.z = rOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +#define NV12_OPT_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ +__kernel void pre_process_nv12_scale_##name##_gq \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t uv_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int* xRatio, \\\n\ + global int* yRatio, \\\n\ + global int* xOffset, \\\n\ + global int* yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + uint4 gidx = get_global_id(0); \\\n\ + uint gidy = get_global_id(1); \\\n\ + gidx += (uint4)(0, 1, 2, 3); \\\n\ + \\\n\ + uint dy = (gidy * yrIntFloat_16) >> 16; \\\n\ + uint4 dx = (gidx * xrIntFloat_16) >> 16; \\\n\ + int sy = convert_int(dy) + (*yOffset); \\\n\ + int4 sx = convert_int4(dx) + (*xOffset); \\\n\ + int4 uvX = sx & 0xfffffffe; \\\n\ + int uvY = sy >> 1; \\\n\ + \\\n\ + vxc_uchar16 Y, UV; \\\n\ + int2 coord = (int2)(sx.x, sy); \\\n\ + int2 coord_uv = (int2)(uvX.x, uvY); \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \\\n\ + vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \\\n\ + int4 offsetUV = uvX - uvX.x; \\\n\ + \\\n\ + vxc_ushort8 diffY, diffUV; \\\n\ + _viv_asm(COPY, diffY, sx, 16); \\\n\ + _viv_asm(COPY, diffUV, offsetUV, 16); \\\n\ + \\\n\ + vxc_ushort8 constData = 8; \\\n\ + VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniCalculateYShift_2x8); \\\n\ + VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniCalculateUVShift_2x8); \\\n\ + VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_char16 tmpUV; \\\n\ + short tmpVal = 128; \\\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \\\n\ + \\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \\\n\ + \\\n\ + conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 
0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ +NV12_OPT_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\ +NV12_OPT_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)\n\ +NV12_OPT_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ +NV12_OPT_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ \n\ -__kernel void pre_process_nv12_scale_U8toF16_gq(\n\ - __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - uint4 gidx = get_global_id(0);\n\ - uint gidy = get_global_id(1);\n\ - gidx += (uint4)(0, 1, 2, 3);\n\ -\n\ - uint dy = (gidy * yrIntFloat_16) >> 16;\n\ - uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ - int sy = convert_int(dy) + (*yOffset);\n\ - int4 sx = convert_int4(dx) + (*xOffset);\n\ - int4 uvX = sx & 0xfffffffe;\n\ - int uvY = sy >> 1;\n\ -\n\ - vxc_uchar16 Y, UV;\n\ - int2 coord = (int2)(sx.x, sy);\n\ - int2 coord_uv = (int2)(uvX.x, uvY);\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ - vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ - int4 offsetUV = uvX - uvX.x;\n\ -\n\ - vxc_ushort8 diffY, diffUV;\n\ - _viv_asm(COPY, diffY, sx, 16);\n\ - _viv_asm(COPY, diffUV, offsetUV, 16);\n\ -\n\ - vxc_ushort8 constData = 8;\n\ - VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniCalculateYShift_2x8);\n\ - VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniCalculateUVShift_2x8);\n\ - VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_char16 tmpUV;\n\ - short tmpVal = 128;\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ -\n\ - float4 tmpDstB, tmpDstG, tmpDstR;\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ -\n\ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp;\n\ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp;\n\ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp;\n\ -\n\ - 
half4 result;\n\ - vxc_half8 tmpdst;\n\ - vxc_short8 dst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - _viv_asm(CONV, result, tmpDstB);\n\ - dstPos.z = bOrder;\n\ - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpdst, 16);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(CONV, result, tmpDstG);\n\ - dstPos.z = 1;\n\ - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpdst, 16);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(CONV, result, tmpDstR);\n\ - dstPos.z = rOrder;\n\ - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpdst, 16);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_nv12_scale_mix_vx*/ +#define NV12_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ +__kernel void pre_process_nv12_scale_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t uv_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int* xRatio, \\\n\ + global int* yRatio, \\\n\ + global int* xOffset, \\\n\ + global int* yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + uint4 gidx = get_global_id(0); \\\n\ + uint gidy = get_global_id(1); \\\n\ + gidx += (uint4)(0, 1, 2, 3); \\\n\ + \\\n\ + uint dy = (gidy * yrIntFloat_16) >> 16; \\\n\ + uint4 dx = (gidx * xrIntFloat_16) >> 16; \\\n\ + int sy = convert_int(dy) + (*yOffset); \\\n\ + int4 sx = convert_int4(dx) + (*xOffset); \\\n\ + int4 uvX = sx & 0xfffffffe; \\\n\ + int uvY = sy >> 1; \\\n\ + \\\n\ + vxc_uchar16 Y, UV; \\\n\ + int2 coord = (int2)(sx.x, sy); \\\n\ + int2 coord_uv = (int2)(uvX.x, uvY); \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = sx.y; \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = sx.z; \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = sx.w; \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.y; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.z; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.w; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_char16 tmpUV; \\\n\ + short tmpVal = 128; \\\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \\\n\ + \\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \\\n\ + \\\n\ + 
conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +NV12_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\ +NV12_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)\n\ +NV12_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ +NV12_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ +"; /* end of pre_process_nv12_scale_vx*/ static const char pre_process_rgb_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -29947,7 +29937,7 @@ IMAGE_PRE_PROCESS_COPY_8BITS(U8, vxc_uchar16)\n\ IMAGE_PRE_PROCESS_COPY_8BITS(I8, vxc_char16)\n\ "; /* end of pre_process_rgb_copy_vx*/ -static const char pre_process_yuv420_copy_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char pre_process_yuv420_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniCalculateTmpR1st_4x4;\n\ _viv_uniform VXC_512Bits uniCalculateTmpR2nd_4x4;\n\ @@ -29981,1131 +29971,921 @@ _viv_uniform VXC_512Bits uniQuantU8toU8HiR_2x8;\n\ \n\ _viv_uniform int bOrder;\n\ _viv_uniform int rOrder;\n\ -_viv_uniform int zp;\n\ -_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform float output_scale;\n\ \n\ -__kernel void pre_process_yuv420_copy_U8toU8(\n\ - __read_only image2d_t y_img,\n\ - __read_only image2d_t u_img,\n\ - __read_only image2d_t v_img,\n\ - __write_only image2d_array_t output,\n\ - global int * xRatio,\n\ - global int * yRatio,\n\ - global int * xOffset,\n\ - global int * yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float var,\n\ - int reverse_channel,\n\ - int trans\n\ - )\n\ -{\n\ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);\n\ - int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1);\n\ - vxc_uchar16 Y;\n\ - vxc_uchar8 U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ - vxc_uchar16 dst0, dst1, dst2;\n\ +#define YUV420_COPY_SH_IMPL(name, dst_type) \\\n\ +__kernel void pre_process_yuv420_copy_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t u_img, \\\n\ + __read_only image2d_array_t v_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int * xRatio, \\\n\ + global int * yRatio, \\\n\ + global int * xOffset, \\\n\ + global int * yOffset, \\\n\ + float rMean, \\\n\ + float 
gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \\\n\ + int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); \\\n\ + vxc_uchar16 Y; \\\n\ + vxc_uchar8 U, V; \\\n\ + vxc_int4 C0, C1, C2, C3; \\\n\ + vxc_uchar16 R, G, B; \\\n\ + dst_type dst0, dst1, dst2; \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, pos.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + /*C = Y - 16;*/ \\\n\ + /*D = U - 128;*/ \\\n\ + /*E = V - 128;*/ \\\n\ + /* calculate R*/ \\\n\ + /* ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]*/ \\\n\ + int tmpV = -56992; \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); \\\n\ + \\\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + \\\n\ + /* calculate G*/ \\\n\ + /* ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]*/ \\\n\ + /* 298Y - 208V*/ \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); \\\n\ + /* 34784 - 100U*/ \\\n\ + ushort tmpG = 34784; \\\n\ + vxc_ushort8 tmpDstG; \\\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \\\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); \\\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); \\\n\ + VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); \\\n\ + VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); \\\n\ + \\\n\ + /* calculate B*/ \\\n\ + /* ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]*/ \\\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); \\\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); \\\n\ + tmpV = -70688; \\\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), 
uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + \\\n\ + var *= output_scale; \\\n\ + float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \\\n\ + rMean * var - output_zp, var); \\\n\ + half4 paramData_f16; \\\n\ + _viv_asm(CONV, paramData_f16, paramData); \\\n\ + \\\n\ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \\\n\ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \\\n\ + \\\n\ + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \\\n\ + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \\\n\ + \\\n\ + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \\\n\ + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \\\n\ + \\\n\ + pos = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + pos.z = bOrder; \\\n\ + VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + pos.z = 1; \\\n\ + VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + pos.z = rOrder; \\\n\ + VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +YUV420_COPY_SH_IMPL(U8toU8, vxc_uchar16)\n\ +YUV420_COPY_SH_IMPL(U8toI8, vxc_char16)\n\ \n\ - VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +#define YUV420_COPY_16BITS_SH_IMPL(name, dst_type) \\\n\ +__kernel void pre_process_yuv420_copy_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t u_img, \\\n\ + __read_only image2d_array_t v_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int * xRatio, \\\n\ + global int * yRatio, \\\n\ + global int * xOffset, \\\n\ + global int * yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \\\n\ + int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); \\\n\ + vxc_uchar16 Y; \\\n\ + vxc_uchar8 U, V; \\\n\ + vxc_int4 C0, C1, C2, C3; \\\n\ + vxc_uchar16 R, G, B; \\\n\ + dst_type dst0, dst1, dst2, dst3, dst4, dst5; \\\n\ + vxc_short8 out0, out1, out2, out3, out4, out5; \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, pos.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int tmpV = -56992; \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); \\\n\ + \\\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); \\\n\ + \\\n\ + ushort tmpG = 34784; \\\n\ + vxc_ushort8 tmpDstG; \\\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \\\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); \\\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); \\\n\ + VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); \\\n\ + VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); \\\n\ + \\\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); \\\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); \\\n\ + tmpV = -70688; \\\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + \\\n\ + var *= output_scale; \\\n\ + float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \\\n\ + rMean * var - output_zp, var); \\\n\ + half4 paramData_f16; \\\n\ + _viv_asm(CONV, paramData_f16, paramData); \\\n\ + \\\n\ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \\\n\ + VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \\\n\ + \\\n\ + VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \\\n\ + VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \\\n\ + \\\n\ + VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \\\n\ + VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \\\n\ + \\\n\ + _viv_asm(COPY, out0, dst0, 16); \\\n\ + _viv_asm(COPY, out1, 
dst1, 16); \\\n\ + _viv_asm(COPY, out2, dst2, 16); \\\n\ + _viv_asm(COPY, out3, dst3, 16); \\\n\ + _viv_asm(COPY, out4, dst4, 16); \\\n\ + _viv_asm(COPY, out5, dst5, 16); \\\n\ + \\\n\ + pos = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8); \\\n\ + VXC_WriteImage2DArray(output, pos.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage2DArray(output, pos.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + pos.z = 1; \\\n\ + VXC_WriteImage2DArray(output, pos.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage2DArray(output, pos.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + pos.z = rOrder; \\\n\ + VXC_WriteImage2DArray(output, pos.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage2DArray(output, pos.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +YUV420_COPY_16BITS_SH_IMPL(U8toF16, vxc_half8)\n\ +YUV420_COPY_16BITS_SH_IMPL(U8toI16, vxc_short8)\n\ +"; /* end of pre_process_yuv420_copy_vx*/ + +static const char pre_process_yuv420_scale_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ - //C = Y - 16;\n\ - //D = U - 128;\n\ - //E = V - 128;\n\ - // calculate R\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);\n\ +_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ \n\ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ +_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ \n\ - // calculate G\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG;\n\ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ - VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4);\n\ - VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4);\n\ +_viv_uniform 
VXC_512Bits uniCalculateTmpRWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ \n\ - // calculate B\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ \n\ - var *= outputScale;\n\ - float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\\\n\ - rMean * var - zp, var);\n\ - half4 paramData_f16;\n\ - _viv_asm(CONV, paramData_f16, paramData);\n\ +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ \n\ - VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ - VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform float output_scale;\n\ \n\ - VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ - VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ +#define YUV420_SCALE_8BITS_SH_IMPL(name, dst_type) \\\n\ +__kernel void pre_process_yuv420_scale_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t u_img, \\\n\ + __read_only image2d_array_t v_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int * xRatio, \\\n\ + global int * yRatio, \\\n\ + global int * xOffset, \\\n\ + global int * yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + int4 gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + gidx += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \\\n\ + int4 sx = fx & 0xffff8000; \\\n\ + int fy, sy; \\\n\ + fx -= sx; \\\n\ + sx = sx >> 15; \\\n\ + fx = (fx +(1 << 4)) >> 5; \\\n\ + 
\\\n\ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \\\n\ + sy = fy & 0xffff8000; \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + \\\n\ + sy = sy < 0 ? 0 : sy; \\\n\ + fy = fy < 0 ? 0 : fy; \\\n\ + \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + sx += (*xOffset); \\\n\ + sy += (*yOffset); \\\n\ + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); \\\n\ + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); \\\n\ + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); \\\n\ + \\\n\ + vxc_uchar16 Y, U, V; \\\n\ + vxc_int4 C0, C1, C2, C3; \\\n\ + vxc_uchar16 R, G, B; \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos1.x = (sx.x + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos2.x = (sx.x + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.y; \\\n\ + srcPos1.x = sx.y >> 1; \\\n\ + srcPos2.x = sx.y >> 1; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos1.x = (sx.y + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos2.x = (sx.y + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.z; \\\n\ + srcPos1.x = sx.z >> 1; \\\n\ + srcPos2.x = sx.z >> 1; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos1.x = (sx.z + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, 
VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos2.x = (sx.z + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.w; \\\n\ + srcPos1.x = sx.w >> 1; \\\n\ + srcPos2.x = sx.w >> 1; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos1.x = (sx.w + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos2.x = (sx.w + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int tmpV = -56992; \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \\\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \\\n\ + \\\n\ + ushort tmpG = 34784; \\\n\ + vxc_ushort8 tmpDstG, tmpDstG1; \\\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \\\n\ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \\\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \\\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \\\n\ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \\\n\ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \\\n\ + \\\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \\\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \\\n\ + tmpV = -70688; \\\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + \\\n\ + int4 result, temp1, temp2; \\\n\ + int4 tmpData0, tmpData1; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + \\\n\ + tmpV = 1 << 19; \\\n\ + dst_type dst; \\\n\ + float4 tmpDst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - bMean) * var; \\\n\ + dstPos.z = bOrder; \\\n\ + result = convert_int4_rte(tmpDst * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - gMean) * var; \\\n\ + dstPos.z = 1; \\\n\ + result = convert_int4_rte(tmpDst * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), 
uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - rMean) * var; \\\n\ + dstPos.z = rOrder; \\\n\ + result = convert_int4_rte(tmpDst * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +YUV420_SCALE_8BITS_SH_IMPL(U8toU8, vxc_uchar8)\n\ +YUV420_SCALE_8BITS_SH_IMPL(U8toI8, vxc_char8)\n\ +"; /* end of pre_process_yuv420_scale_0_vx*/ + +static const char pre_process_yuv420_scale_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ - VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ - VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ +_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ \n\ - pos = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ - pos.z = bOrder;\n\ - VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - pos.z = 1;\n\ - VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - pos.z = rOrder;\n\ - VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +#define YUV420_SCALE_16BITS_SH_IMPL(name, dst_type, conv_type) \\\n\ +__kernel void pre_process_yuv420_scale_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t u_img, \\\n\ + __read_only image2d_array_t v_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int * xRatio, \\\n\ + global int * yRatio, \\\n\ + global int * xOffset, \\\n\ + global int * yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + int4 gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + gidx += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \\\n\ + int4 sx = fx & 0xffff8000; \\\n\ + int fy, sy; \\\n\ + fx -= sx; \\\n\ + sx = sx >> 
15; \\\n\ + fx = (fx +(1 << 4)) >> 5; \\\n\ + \\\n\ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \\\n\ + sy = fy & 0xffff8000; \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + \\\n\ + sy = sy < 0 ? 0 : sy; \\\n\ + fy = fy < 0 ? 0 : fy; \\\n\ + \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + sx += (*xOffset); \\\n\ + sy += (*yOffset); \\\n\ + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); \\\n\ + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); \\\n\ + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); \\\n\ + \\\n\ + vxc_uchar16 Y, U, V; \\\n\ + vxc_int4 C0, C1, C2, C3; \\\n\ + vxc_uchar16 R, G, B; \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos1.x = (sx.x + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos2.x = (sx.x + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.y; \\\n\ + srcPos1.x = sx.y >> 1; \\\n\ + srcPos2.x = sx.y >> 1; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos1.x = (sx.y + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos2.x = (sx.y + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.z; \\\n\ + srcPos1.x = sx.z >> 1; \\\n\ + srcPos2.x = sx.z >> 1; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos1.x = (sx.z + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); 
\\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos2.x = (sx.z + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.w; \\\n\ + srcPos1.x = sx.w >> 1; \\\n\ + srcPos2.x = sx.w >> 1; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos1.x = (sx.w + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos2.x = (sx.w + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int tmpV = -56992; \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \\\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \\\n\ + \\\n\ + ushort tmpG = 34784; \\\n\ + vxc_ushort8 tmpDstG, tmpDstG1; \\\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \\\n\ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \\\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \\\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \\\n\ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \\\n\ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \\\n\ + \\\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \\\n\ + 
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \\\n\ + tmpV = -70688; \\\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + \\\n\ + int4 result, temp1, temp2; \\\n\ + int4 tmpData0, tmpData1; \\\n\ + dst_type tmpResult; \\\n\ + conv_type tmpVal; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + \\\n\ + tmpV = 1 << 19; \\\n\ + vxc_short8 dst; \\\n\ + float4 tmpDst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - bMean) * var; \\\n\ + dstPos.z = bOrder; \\\n\ + tmpDst = tmpDst * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal, tmpDst); \\\n\ + VXC_DP2x8(tmpResult, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmpResult, 8); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - gMean) * var; \\\n\ + dstPos.z = 1; \\\n\ + tmpDst = tmpDst * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal, tmpDst); \\\n\ + VXC_DP2x8(tmpResult, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmpResult, 8); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, R, R, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - rMean) * var; \\\n\ + dstPos.z = rOrder; \\\n\ + tmpDst = tmpDst * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal, tmpDst); \\\n\ + VXC_DP2x8(tmpResult, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmpResult, 8); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +YUV420_SCALE_16BITS_SH_IMPL(U8toF16, vxc_half8, half4)\n\ +YUV420_SCALE_16BITS_SH_IMPL(U8toI16, vxc_short8, int4)\n\ +"; /* end of pre_process_yuv420_scale_1_vx*/ + +static const char pre_process_yuv422_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +\n\ +_viv_uniform float outputScaleVar;\n\ +_viv_uniform float bMeanScaleVarZp;\n\ +_viv_uniform float gMeanScaleVarZp;\n\ +_viv_uniform float rMeanScaleVarZp;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertYUV422toB_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertYUV422toG_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertYUV422toR_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;\n\ +\n\ +#define YUV422_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ +__kernel void pre_process_yuv422_copy_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int* xRatio, \\\n\ + global int* yRatio, \\\n\ + global int* xOffset, \\\n\ + global int* yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans, \\\n\ + int yuv422_type \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int sy = gidy + (*yOffset); \\\n\ + int sx = gidx + (*xOffset * 2); \\\n\ + \\\n\ + vxc_uchar8 YUV; \\\n\ + vxc_short8 tmpYUV; \\\n\ + \\\n\ + VXC_ReadImage(YUV, input, (int2)(sx,sy), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + if (yuv422_type == 1) \\\n\ + { \\\n\ + YUV.s01234567 = YUV.s10325476; \\\n\ + } \\\n\ +\\\n\ + short tmpVal = 128; \\\n\ + VXC_DP2x8(tmpYUV, YUV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \\\n\ + \\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + VXC_DP4x4(tmpDstB, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \\\n\ + VXC_DP4x4(tmpDstG, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \\\n\ + VXC_DP4x4(tmpDstR, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \\\n\ + \\\n\ + conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(gidx, gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, 
tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +YUV422_COPY_SH_IMPL(U8toU8, vxc_uchar4, int4, vxc_uchar4, 4)\n\ +YUV422_COPY_SH_IMPL(U8toI8, vxc_char4, int4, vxc_char4, 4)\n\ +YUV422_COPY_SH_IMPL(U8toI16, vxc_short4, int4, vxc_short4, 8)\n\ +YUV422_COPY_SH_IMPL(U8toF16, vxc_half4, half4, vxc_short4, 8)\n\ +"; /* end of pre_process_yuv422_copy_vx*/ + +static const char pre_process_yuv422_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +\n\ +_viv_uniform float outputScaleVar;\n\ +_viv_uniform float bMeanScaleVarZp;\n\ +_viv_uniform float gMeanScaleVarZp;\n\ +_viv_uniform float rMeanScaleVarZp;\n\ +\n\ +_viv_uniform uint xrIntFloat_16;\n\ +_viv_uniform uint yrIntFloat_16;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertYUV422toB_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertYUV422toG_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertYUV422toR_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;\n\ +\n\ +#define uyvy422 1\n\ +\n\ +#define YUV422_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ +__kernel void pre_process_yuv422_scale_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int* xRatio, \\\n\ + global int* yRatio, \\\n\ + global int* xOffset, \\\n\ + global int* yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans, \\\n\ + int yuv422_type \\\n\ + ) \\\n\ +{ \\\n\ + int4 gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + gidx += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + uint dy = (convert_uint(gidy) * yrIntFloat_16) >> 16; \\\n\ + uint4 dx = (convert_uint4(gidx) * xrIntFloat_16) >> 16; \\\n\ + int sy = convert_int(dy) + (*yOffset); \\\n\ + int4 sx = convert_int4(dx)+ (*xOffset * 2); \\\n\ + \\\n\ + vxc_uchar4 Y; \\\n\ + vxc_uchar8 UV; \\\n\ + vxc_char8 tmpUV; \\\n\ + short tmpVal = 128; \\\n\ + int y_offset = 0; \\\n\ + int u_offset = 1; \\\n\ + int v_offset = 3; \\\n\ +\\\n\ + if (yuv422_type == uyvy422) \\\n\ + { \\\n\ + y_offset = 1; \\\n\ + u_offset = 0; \\\n\ + v_offset = 2; \\\n\ + } \\\n\ +\\\n\ + int4 coord_Y = (int4)(sx.x * 2 + y_offset, sy, 0, 0); \\\n\ + int4 coord_U = (int4)((sx.x >> 1) * 4 + u_offset, sy, 0, 0); \\\n\ + int4 coord_V = (int4)((sx.x >> 1) * 4 + v_offset, sy, 0, 0); \\\n\ +\\\n\ + VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_Y.x = sx.y * 2 + y_offset; \\\n\ + VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_Y.x = sx.z * 2 + y_offset; \\\n\ + VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_Y.x = sx.w * 2 + y_offset; \\\n\ + VXC_ReadImage2DArray(Y, input, coord_Y, 0, 
VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + sx = (sx >> 1) * 4 + u_offset; \\\n\ + VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_U.x = sx.y; \\\n\ + VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_U.x = sx.z; \\\n\ + VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_U.x = sx.w; \\\n\ + VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +\\\n\ + sx = sx - u_offset + v_offset; \\\n\ + VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_V.x = sx.y; \\\n\ + VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_V.x = sx.z; \\\n\ + VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_V.x = sx.w; \\\n\ + VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \\\n\ + vxc_uchar4 dst_test; \\\n\ + VXC_DP2x8(dst_test, dx, dx, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ +\\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \\\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \\\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \\\n\ + \\\n\ + conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(gidx.x, gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ \n\ -__kernel void pre_process_yuv420_copy_U8toF16(\n\ - __read_only image2d_t y_img,\n\ - __read_only image2d_t u_img,\n\ - __read_only image2d_t v_img,\n\ - __write_only image2d_array_t output,\n\ - global int * xRatio,\n\ - global int * yRatio,\n\ - global int * xOffset,\n\ - global int * yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float var,\n\ - int reverse_channel,\n\ - int trans\n\ - )\n\ -{\n\ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);\n\ - int2 pos1 = 
(int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1);\n\ - vxc_uchar16 Y;\n\ - vxc_uchar8 U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ - vxc_half8 dst0, dst1, dst2, dst3, dst4, dst5;\n\ - vxc_short8 out0, out1, out2, out3, out4, out5;\n\ -\n\ - VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //C = Y - 16;\n\ - //D = U - 128;\n\ - //E = V - 128;\n\ - // calculate R\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);\n\ -\n\ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ -\n\ - // calculate G\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG;\n\ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ - VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4);\n\ - VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4);\n\ -\n\ - // calculate B\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, 
VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ -\n\ - float4 paramData = (float4)(bMean * var, gMean * var,\\\n\ - rMean * var, var);\n\ - half4 paramData_f16;\n\ - _viv_asm(CONV, paramData_f16, paramData);\n\ -\n\ - VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ - VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ -\n\ - VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ - VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ -\n\ - VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ - VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ -\n\ - _viv_asm(COPY, out0, dst0, 16);\n\ - _viv_asm(COPY, out1, dst1, 16);\n\ - _viv_asm(COPY, out2, dst2, 16);\n\ - _viv_asm(COPY, out3, dst3, 16);\n\ - _viv_asm(COPY, out4, dst4, 16);\n\ - _viv_asm(COPY, out5, dst5, 16);\n\ -\n\ - pos = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8);\n\ - VXC_WriteImage2DArray(output, pos.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, pos.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - pos.z = 1;\n\ - VXC_WriteImage2DArray(output, pos.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, pos.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - pos.z = rOrder;\n\ - VXC_WriteImage2DArray(output, pos.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, pos.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -"; /* end of pre_process_yuv420_copy_u8_vx*/ - -static const char pre_process_yuv420_scale_fp16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ -_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ -\n\ -_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -\n\ -__kernel void pre_process_yuv420_scale_U8toF16(\n\ - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img,\n\ - __read_only image2d_array_t 
v_img, __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - int4 gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - gidx += (int4)(0, 1, 2, 3);\n\ -\n\ - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ - int4 sx = fx & 0xffff8000; // Floor\n\ - int fy, sy;\n\ - fx -= sx;\n\ - sx = sx >> 15;\n\ - fx = (fx +(1 << 4)) >> 5;\n\ -\n\ - // for y\n\ - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ - sy = fy & 0xffff8000; // Floor\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - sy = sy < 0 ? 0 : sy;\n\ - fy = fy < 0 ? 0 : fy;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ - sx += (*xOffset);\n\ - sy += (*yOffset);\n\ - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);\n\ - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);\n\ - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);\n\ -\n\ - vxc_uchar16 Y, U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ -\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.y;\n\ - srcPos1.x = sx.y >> 1;\n\ - srcPos2.x = sx.y >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.z;\n\ - 
srcPos1.x = sx.z >> 1;\n\ - srcPos2.x = sx.z >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.w;\n\ - srcPos1.x = sx.w >> 1;\n\ - srcPos2.x = sx.w >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //C = Y - 16; D = U - 128; E = V - 128;\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);\n\ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ -\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - 
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG, tmpDstG1;\n\ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);\n\ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ -\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ -\n\ - int4 result, temp1, temp2;\n\ - int4 tmpData0, tmpData1;\n\ -\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - // temp2 - temp1\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ -\n\ - vxc_half8 tmpVal;\n\ - half4 hDst;\n\ - tmpV = 1 << 19;\n\ - vxc_short8 dst;\n\ - float4 tmpDst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - bMean) * var;\n\ - dstPos.z = bOrder;\n\ - _viv_asm(CONV, hDst, tmpDst);\n\ - VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpVal, 16);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - gMean) * var;\n\ - dstPos.z = 1;\n\ - _viv_asm(CONV, hDst, tmpDst);\n\ - VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpVal, 16);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - rMean) * var;\n\ - dstPos.z = rOrder;\n\ - _viv_asm(CONV, hDst, tmpDst);\n\ - VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpVal, 16);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_yuv420_scale_fp16_vx*/ - -static const char pre_process_yuv420_scale_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ -\n\ -_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -_viv_uniform float outputScale;\n\ -\n\ -__kernel void pre_process_yuv420_scale_U8toI16(\n\ - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img,\n\ - __read_only image2d_array_t v_img, __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int 
reverse_channel, int trans)\n\ -{\n\ - int4 gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - gidx += (int4)(0, 1, 2, 3);\n\ -\n\ - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ - int4 sx = fx & 0xffff8000; // Floor\n\ - int fy, sy;\n\ - fx -= sx;\n\ - sx = sx >> 15;\n\ - fx = (fx +(1 << 4)) >> 5;\n\ -\n\ - // for y\n\ - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ - sy = fy & 0xffff8000; // Floor\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - sy = sy < 0 ? 0 : sy;\n\ - fy = fy < 0 ? 0 : fy;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ - sx += (*xOffset);\n\ - sy += (*yOffset);\n\ - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);\n\ - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);\n\ - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);\n\ -\n\ - vxc_uchar16 Y, U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ -\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.y;\n\ - srcPos1.x = sx.y >> 1;\n\ - srcPos2.x = sx.y >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.z;\n\ - srcPos1.x = sx.z >> 1;\n\ - srcPos2.x = sx.z >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, 
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.w;\n\ - srcPos1.x = sx.w >> 1;\n\ - srcPos2.x = sx.w >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //C = Y - 16; D = U - 128; E = V - 128;\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);\n\ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ -\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);\n\ - 
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG, tmpDstG1;\n\ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);\n\ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ -\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ -\n\ - int4 result, temp1, temp2;\n\ - int4 tmpData0, tmpData1;\n\ -\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - // temp2 - temp1\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ -\n\ - tmpV = 1 << 19;\n\ - vxc_short8 dst;\n\ - float4 tmpDst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - bMean) * var;\n\ - dstPos.z = bOrder;\n\ - result = convert_int4_rte(tmpDst * outputScale);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 
0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - gMean) * var;\n\ - dstPos.z = 1;\n\ - result = convert_int4_rte(tmpDst * outputScale);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - rMean) * var;\n\ - dstPos.z = rOrder;\n\ - result = convert_int4_rte(tmpDst * outputScale);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_yuv420_scale_i16_vx*/ - -static const char pre_process_yuv420_scale_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ -\n\ -_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -_viv_uniform float outputScale;\n\ -\n\ -__kernel void pre_process_yuv420_scale_U8toI8(\n\ - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img,\n\ - __read_only image2d_array_t v_img, __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - int4 gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - gidx += (int4)(0, 1, 2, 3);\n\ -\n\ - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ - int4 sx = fx & 0xffff8000; // Floor\n\ - int fy, sy;\n\ - fx -= sx;\n\ - sx = sx >> 
15;\n\ - fx = (fx +(1 << 4)) >> 5;\n\ -\n\ - // for y\n\ - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ - sy = fy & 0xffff8000; // Floor\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - sy = sy < 0 ? 0 : sy;\n\ - fy = fy < 0 ? 0 : fy;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ - sx += (*xOffset);\n\ - sy += (*yOffset);\n\ - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);\n\ - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);\n\ - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);\n\ -\n\ - vxc_uchar16 Y, U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ -\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.y;\n\ - srcPos1.x = sx.y >> 1;\n\ - srcPos2.x = sx.y >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.z;\n\ - srcPos1.x = sx.z >> 1;\n\ - srcPos2.x = sx.z >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - 
srcPos1.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.w;\n\ - srcPos1.x = sx.w >> 1;\n\ - srcPos2.x = sx.w >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //C = Y - 16; D = U - 128; E = V - 128;\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);\n\ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ -\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG, tmpDstG1;\n\ - 
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);\n\ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ -\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ -\n\ - int4 result, temp1, temp2;\n\ - int4 tmpData0, tmpData1;\n\ -\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - // temp2 - temp1\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ -\n\ - tmpV = 1 << 19;\n\ - vxc_char8 dst;\n\ - float4 tmpDst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - bMean) * var;\n\ - dstPos.z = bOrder;\n\ - result = convert_int4_rte(tmpDst * outputScale);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - gMean) * var;\n\ - dstPos.z = 1;\n\ - result = convert_int4_rte(tmpDst * outputScale);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - 
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - rMean) * var;\n\ - dstPos.z = rOrder;\n\ - result = convert_int4_rte(tmpDst * outputScale);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_yuv420_scale_i8_vx*/ - -static const char pre_process_yuv420_scale_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ -\n\ -_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -_viv_uniform int zp;\n\ -_viv_uniform float outputScale;\n\ -\n\ -__kernel void pre_process_yuv420_scale_U8toU8(\n\ - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img,\n\ - __read_only image2d_array_t v_img, __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - int4 gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - gidx += (int4)(0, 1, 2, 3);\n\ -\n\ - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ - int4 sx = fx & 0xffff8000; // Floor\n\ - int fy, sy;\n\ - fx -= sx;\n\ - sx = sx >> 15;\n\ - fx = (fx +(1 << 4)) >> 5;\n\ -\n\ - // for y\n\ - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ - sy = fy & 0xffff8000; // Floor\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - sy = sy < 0 ? 0 : sy;\n\ - fy = fy < 0 ? 
0 : fy;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ - sx += (*xOffset);\n\ - sy += (*yOffset);\n\ - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);\n\ - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);\n\ - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);\n\ -\n\ - vxc_uchar16 Y, U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ -\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.y;\n\ - srcPos1.x = sx.y >> 1;\n\ - srcPos2.x = sx.y >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.z;\n\ - srcPos1.x = sx.z >> 1;\n\ - srcPos2.x = sx.z >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 
0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.w;\n\ - srcPos1.x = sx.w >> 1;\n\ - srcPos2.x = sx.w >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //C = Y - 16; D = U - 128; E = V - 128;\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);\n\ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ -\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG, tmpDstG1;\n\ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);\n\ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 
0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ -\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ -\n\ - int4 result, temp1, temp2;\n\ - int4 tmpData0, tmpData1;\n\ -\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - // temp2 - temp1\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ -\n\ - tmpV = 1 << 19;\n\ - vxc_uchar8 dst;\n\ - float4 tmpDst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - bMean) * var;\n\ - dstPos.z = bOrder;\n\ - result = convert_int4_rte(tmpDst * outputScale + zp);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - gMean) * var;\n\ - dstPos.z = 1;\n\ - result = convert_int4_rte(tmpDst * outputScale + zp);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - rMean) * var;\n\ - dstPos.z = rOrder;\n\ - result = convert_int4_rte(tmpDst * outputScale + zp);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_yuv420_scale_u8_vx*/ +YUV422_SH_IMPL(U8toU8, vxc_uchar4, int4, vxc_uchar4, 4)\n\ +YUV422_SH_IMPL(U8toI8, vxc_char4, int4, vxc_char4, 4)\n\ +YUV422_SH_IMPL(U8toI16, vxc_short4, int4, vxc_short4, 8)\n\ +YUV422_SH_IMPL(U8toF16, vxc_half4, half4, vxc_short4, 8)\n\ +"; /* end of pre_process_yuv422_scale_vx*/ static const char pre_process_yuv444_copy_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -40004,15 +39784,15 @@ _viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ vxc_ushort8 mp0, mp1; \\\n\ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ - read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(src0, input0, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(src1, input1, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniU8MulAndPostShift0_Lo_2x8); \\\n\ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniU8MulAndPostShift1_Lo_2x8); \\\n\ - read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(value_tmp, condition, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP2x8(value, value_tmp, value_tmp,\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \\\n\ @@ -40052,11 +39832,11 @@ SELECT_INT_FUN_2D(I8, I16, I16, vxc_short8)\n\ #define SELECT_HALF(read_fun, write_fun) \\\n\ vxc_short8 src0, src1, dst, value; \\\n\ vxc_char8 value_tmp; \\\n\ - read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(src0, input0, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(src1, input1, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(value_tmp, condition, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP2x8(value, value_tmp, value_tmp,\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \\\n\ @@ -40083,37 +39863,36 @@ __kernel void select_I8_F16_F16toF16_2D(\n\ SELECT_HALF(VXC_ReadImage, VXC_WriteImage)\n\ }\n\ \n\ -#define SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, read_fun, write_fun) \\\n\ - vxc_short8 src0, src1, dst, value; \\\n\ - vxc_half8 value0, value1; \\\n\ - src0_type r0; \\\n\ - src1_type r1; \\\n\ +#define SELECT_HYBRID(src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type, read_fun, write_fun) \\\n\ + save_type dst, 
value; \\\n\ + save_type dst0, dst1; \\\n\ + dst_type value0, value1; \\\n\ + src0_type src0; \\\n\ + src1_type src1; \\\n\ copy0_type v0; \\\n\ copy1_type v1; \\\n\ vxc_char8 value_tmp; \\\n\ vxc_ushort8 mp0, mp1; \\\n\ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ - read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, v0, src0, 16); \\\n\ - read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, v1, src1, 16); \\\n\ VXC_DP2x8(value0, v0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniU8MulAndPostShift0_Lo_2x8); \\\n\ - _viv_asm(COPY, src0, value0, 16); \\\n\ + _viv_asm(COPY, dst0, value0, 16); \\\n\ VXC_DP2x8(value1, v1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniU8MulAndPostShift1_Lo_2x8); \\\n\ - _viv_asm(COPY, src1, value1, 16); \\\n\ - read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + _viv_asm(COPY, dst1, value1, 16); \\\n\ + read_fun(value_tmp, condition, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP2x8(value, value_tmp, value_tmp,\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \\\n\ - dst = (value != 0 ? src0 : src1); \\\n\ + dst = (value != 0 ? dst0 : dst1); \\\n\ write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ -#define SELECT_HYBRID_TOF16_FUN(name, src0_type, copy0_type, src1_type, copy1_type) \\\n\ +#define SELECT_HYBRID_FUN(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type) \\\n\ __kernel void select_##name( \\\n\ __read_only image2d_array_t condition, \\\n\ __read_only image2d_array_t input0, \\\n\ @@ -40121,44 +39900,62 @@ __kernel void select_##name( \\\n\ __write_only image2d_array_t output) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ - SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \\\n\ + SELECT_HYBRID(src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type,\\\n\ VXC_ReadImage2DArray, VXC_WriteImage2DArray) \\\n\ }\n\ -SELECT_HYBRID_TOF16_FUN(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16)\n\ -SELECT_HYBRID_TOF16_FUN(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8)\n\ -SELECT_HYBRID_TOF16_FUN(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16)\n\ -SELECT_HYBRID_TOF16_FUN(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8)\n\ -SELECT_HYBRID_TOF16_FUN(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ -SELECT_HYBRID_TOF16_FUN(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8)\n\ +SELECT_HYBRID_FUN(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, 
vxc_half8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar8, vxc_uchar8)\n\ +SELECT_HYBRID_FUN(I8_U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8)\n\ +SELECT_HYBRID_FUN(I8_F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char8, vxc_char8)\n\ +SELECT_HYBRID_FUN(I8_I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char8, vxc_char8)\n\ +SELECT_HYBRID_FUN(I8_F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_U8_U8toF16, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_I8_I8toF16, vxc_char8, vxc_char8, vxc_char8, vxc_char8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8)\n\ \n\ -#define SELECT_HYBRID_TOF16_FUN_2D(name, src0_type, copy0_type, src1_type, copy1_type) \\\n\ -__kernel void select_##name( \\\n\ +#define SELECT_HYBRID_FUN_2D(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type) \\\n\ +__kernel void select_##name##_2D( \\\n\ __read_only image2d_array_t condition, \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ __write_only image2d_array_t output) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ - SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \\\n\ + SELECT_HYBRID(src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type, \\\n\ VXC_ReadImage, VXC_WriteImage) \\\n\ }\n\ -SELECT_HYBRID_TOF16_FUN_2D(I8_F16_U8toF16_2D, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16)\n\ -SELECT_HYBRID_TOF16_FUN_2D(I8_U8_F16toF16_2D, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8)\n\ -SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I8toF16_2D, vxc_short8, vxc_half8, vxc_char16, vxc_char16)\n\ -SELECT_HYBRID_TOF16_FUN_2D(I8_I8_F16toF16_2D, vxc_char16, vxc_char16, vxc_short8, vxc_half8)\n\ -SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I16toF16_2D, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ -SELECT_HYBRID_TOF16_FUN_2D(I8_I16_F16toF16_2D, vxc_short8, vxc_short8, vxc_short8, vxc_half8)\n\ +SELECT_HYBRID_FUN_2D(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar8, vxc_uchar8)\n\ +SELECT_HYBRID_FUN_2D(I8_U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8)\n\ +SELECT_HYBRID_FUN_2D(I8_F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char8, vxc_char8)\n\ +SELECT_HYBRID_FUN_2D(I8_I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char8, vxc_char8)\n\ +SELECT_HYBRID_FUN_2D(I8_F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, 
vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_U8_U8toF16, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_I8_I8toF16, vxc_char8, vxc_char8, vxc_char8, vxc_char8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8)\n\ \n\ #define SELECT_HALF_TO_QINT(read_fun, write_fun, dst_type) \\\n\ vxc_short8 src0, src1, tmp_dst, value; \\\n\ vxc_half8 data; \\\n\ dst_type dst; \\\n\ vxc_char8 value_tmp; \\\n\ - read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(src0, input0, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(src1, input1, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(value_tmp, condition, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP2x8(value, value_tmp, value_tmp,\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \\\n\ @@ -44100,6 +43897,289 @@ __kernel void batch_norm_I32to##TYPE##_2D \\\n\ BATCH_NORM_I32_SH_IMPL_2D(I32)\n\ BATCH_NORM_I32_SH_IMPL_2D(F32)"; /* end of batchnorm_single_cl*/ +static const char bucketize_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ +\n\ +#define BUCKETIZE_F32_2D_SH_IMPL(name, comp_op) \\\n\ +__kernel void bucketize_##name \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_t output, \\\n\ + int boundaries_size, \\\n\ + float input0_scale, \\\n\ + float input0_tail, \\\n\ + float input1_scale, \\\n\ + float input1_tail \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + float4 src0 = read_imagef(input, coord); \\\n\ + \\\n\ + int2 pos = 0; \\\n\ + do \\\n\ + { \\\n\ + float4 src1 = read_imagef(boundaries, pos); \\\n\ + if ((src0.x) comp_op (src1.x)) \\\n\ + { \\\n\ + break; \\\n\ + } \\\n\ + pos.x ++; \\\n\ + } while(pos.x < boundaries_size); \\\n\ + \\\n\ + write_imagei(output, coord, pos.xxxx); \\\n\ +}\n\ +BUCKETIZE_F32_2D_SH_IMPL(F32_F32toI32_2D, <=)\n\ +BUCKETIZE_F32_2D_SH_IMPL(right_F32_F32toI32_2D, <)\n\ +\n\ +#define BUCKETIZE_F32_SH_IMPL(name, comp_op) \\\n\ +__kernel void bucketize_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_array_t output, \\\n\ + int boundaries_size, \\\n\ + float input0_scale, \\\n\ + float input0_tail, \\\n\ + float input1_scale, \\\n\ + float input1_tail \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + float4 src0 = read_imagef(input, coord); \\\n\ + \\\n\ + int2 pos = 0; \\\n\ + do \\\n\ + { \\\n\ + float4 src1 = read_imagef(boundaries, pos); \\\n\ + if ((src0.x) comp_op (src1.x)) \\\n\ + { \\\n\ + break; \\\n\ + } \\\n\ + pos.x ++; \\\n\ + } while(pos.x < boundaries_size); \\\n\ + \\\n\ + write_imagei(output, coord, pos.xxxx); \\\n\ +}\n\ +BUCKETIZE_F32_SH_IMPL(F32_F32toI32, <=)\n\ +BUCKETIZE_F32_SH_IMPL(right_F32_F32toI32, <)\n\ +\n\ +#define BUCKETIZE_I32_2D_SH_IMPL(name, comp_op) \\\n\ +__kernel void bucketize_##name \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_t 
output, \\\n\ + int boundaries_size, \\\n\ + float input0_scale, \\\n\ + float input0_tail, \\\n\ + float input1_scale, \\\n\ + float input1_tail \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + float4 src0 = convert_float4(read_imagei(input, coord)); \\\n\ + \\\n\ + int2 pos = 0; \\\n\ + src0 = src0 * input0_scale + input0_tail; \\\n\ + do \\\n\ + { \\\n\ + float4 src1 = convert_float4(read_imagei(boundaries, pos)); \\\n\ + src1 = src1 * input1_scale + input1_tail; \\\n\ + if ((src0.x) comp_op (src1.x)) \\\n\ + { \\\n\ + break; \\\n\ + } \\\n\ + pos.x ++; \\\n\ + } while(pos.x < boundaries_size); \\\n\ + \\\n\ + write_imagei(output, coord, pos.xxxx); \\\n\ +}\n\ +BUCKETIZE_I32_2D_SH_IMPL(I32_I32toI32_2D, <=)\n\ +BUCKETIZE_I32_2D_SH_IMPL(right_I32_I32toI32_2D, <)\n\ +\n\ +#define BUCKETIZE_I32_SH_IMPL(name, comp_op) \\\n\ +__kernel void bucketize_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_array_t output, \\\n\ + int boundaries_size, \\\n\ + float input0_scale, \\\n\ + float input0_tail, \\\n\ + float input1_scale, \\\n\ + float input1_tail \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + int4 data = read_imagei(input, coord); \\\n\ + float4 src0 = convert_float4(data) * input0_scale + input0_tail; \\\n\ + \\\n\ + int2 pos = 0; \\\n\ + do \\\n\ + { \\\n\ + float4 src1 = convert_float4(read_imagei(boundaries, pos)); \\\n\ + src1 = src1 * input1_scale + input1_tail; \\\n\ + if ((src0.x) comp_op (src1.x)) \\\n\ + { \\\n\ + break; \\\n\ + } \\\n\ + pos.x ++; \\\n\ + } while(pos.x < boundaries_size); \\\n\ + \\\n\ + write_imagei(output, coord, pos.xxxx); \\\n\ +}\n\ +BUCKETIZE_I32_SH_IMPL(I32_I32toI32, <=)\n\ +BUCKETIZE_I32_SH_IMPL(right_I32_I32toI32, <)\n\ +\n\ +#define BUCKETIZE_U32_2D_SH_IMPL(name, comp_op) \\\n\ +__kernel void bucketize_##name \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_t output, \\\n\ + int boundaries_size, \\\n\ + float input0_scale, \\\n\ + float input0_tail, \\\n\ + float input1_scale, \\\n\ + float input1_tail \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + float4 src0 = convert_float4(read_imageui(input, coord)); \\\n\ + \\\n\ + int2 pos = 0; \\\n\ + src0 = src0 * input0_scale + input0_tail; \\\n\ + do \\\n\ + { \\\n\ + float4 src1 = convert_float4(read_imageui(boundaries, pos)); \\\n\ + src1 = src1 * input1_scale + input1_tail; \\\n\ + if ((src0.x) comp_op (src1.x)) \\\n\ + { \\\n\ + break; \\\n\ + } \\\n\ + pos.x ++; \\\n\ + } while(pos.x < boundaries_size); \\\n\ + \\\n\ + write_imagei(output, coord, pos.xxxx); \\\n\ +}\n\ +BUCKETIZE_U32_2D_SH_IMPL(U32_U32toI32_2D, <=)\n\ +BUCKETIZE_U32_2D_SH_IMPL(right_U32_U32toI32_2D, <)\n\ +\n\ +#define BUCKETIZE_U32_SH_IMPL(name, comp_op) \\\n\ +__kernel void bucketize_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_array_t output, \\\n\ + int boundaries_size, \\\n\ + float input0_scale, \\\n\ + float input0_tail, \\\n\ + float input1_scale, \\\n\ + float input1_tail \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + uint4 data = read_imageui(input, coord); \\\n\ + float4 src0 = convert_float4(data) * input0_scale + input0_tail; \\\n\ + \\\n\ + 
int2 pos = 0; \\\n\ + do \\\n\ + { \\\n\ + float4 src1 = convert_float4(read_imageui(boundaries, pos)); \\\n\ + src1 = src1 * input1_scale + input1_tail; \\\n\ + if ((src0.x) comp_op (src1.x)) \\\n\ + { \\\n\ + break; \\\n\ + } \\\n\ + pos.x ++; \\\n\ + } while(pos.x < boundaries_size); \\\n\ + \\\n\ + write_imagei(output, coord, pos.xxxx); \\\n\ +}\n\ +BUCKETIZE_U32_SH_IMPL(U32_U32toI32, <=)\n\ +BUCKETIZE_U32_SH_IMPL(right_U32_U32toI32, <)\n\ +\n\ +#define BUCKETIZE_BF16_2D_SH_IMPL(name, comp_op) \\\n\ +__kernel void bucketize_##name \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_t output, \\\n\ + int boundaries_size, \\\n\ + float input0_scale, \\\n\ + float input0_tail, \\\n\ + float input1_scale, \\\n\ + float input1_tail \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + uint4 data0 = read_imageui(input, coord) << 16; \\\n\ + float4 src0; \\\n\ + _viv_asm(COPY, src0, data0, 16); \\\n\ + \\\n\ + int2 pos = 0; \\\n\ + do \\\n\ + { \\\n\ + uint4 data1 = read_imageui(boundaries, pos) << 16; \\\n\ + float4 src1; \\\n\ + _viv_asm(COPY, src1, data1, 16); \\\n\ + if ((src0.x) comp_op (src1.x)) \\\n\ + { \\\n\ + break; \\\n\ + } \\\n\ + pos.x ++; \\\n\ + } while(pos.x < boundaries_size); \\\n\ + \\\n\ + write_imagei(output, coord, pos.xxxx); \\\n\ +}\n\ +BUCKETIZE_BF16_2D_SH_IMPL(BF16_BF16toI32_2D, <=)\n\ +BUCKETIZE_BF16_2D_SH_IMPL(right_BF16_BF16toI32_2D, <)\n\ +\n\ +#define BUCKETIZE_BF16_SH_IMPL(name, comp_op) \\\n\ +__kernel void bucketize_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_array_t output, \\\n\ + int boundaries_size, \\\n\ + float input0_scale, \\\n\ + float input0_tail, \\\n\ + float input1_scale, \\\n\ + float input1_tail \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + uint4 data0 = read_imageui(input, coord) << 16; \\\n\ + float4 src0; \\\n\ + _viv_asm(COPY, src0, data0, 16); \\\n\ + \\\n\ + int2 pos = 0; \\\n\ + do \\\n\ + { \\\n\ + uint4 data1 = read_imageui(boundaries, pos) << 16; \\\n\ + float4 src1; \\\n\ + _viv_asm(COPY, src1, data1, 16); \\\n\ + if ((src0.x) comp_op (src1.x)) \\\n\ + { \\\n\ + break; \\\n\ + } \\\n\ + pos.x ++; \\\n\ + } while(pos.x < boundaries_size); \\\n\ + \\\n\ + write_imagei(output, coord, pos.xxxx); \\\n\ +}\n\ +BUCKETIZE_BF16_SH_IMPL(BF16_BF16toI32, <=)\n\ +BUCKETIZE_BF16_SH_IMPL(right_BF16_BF16toI32, <)\n\ +"; /* end of bucketize_cl*/ + static const char cast_cl[] = "\n\ #define CAST_FUN(src_name, dst_name, src_type, dst_type, conv_fun, read_fun, write_fun) \\\n\ __kernel void cast_##src_name##to##dst_name( \\\n\ @@ -50206,6 +50286,123 @@ TENSORLOGICAL_2D(and, &&, )\n\ TENSORLOGICAL_2D(xor, ^, !!)\n\ "; /* end of logical_ops_cl*/ +static const char lppool_cl[] = "\n\ +#define LPPOOL_PROCESS(src_type, dst_type, readimage_type, conv_mode, writeimage_type) \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int hstart = gidy * stride_y - pad_top; \\\n\ + int wstart = gidx * stride_x - pad_left; \\\n\ + int hend = min(hstart + ksize_y, height); \\\n\ + int wend = min(wstart + ksize_x, width); \\\n\ + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0); \\\n\ + int4 coord_in = coord_out; \\\n\ + int h, w; \\\n\ + float sum_of_pow = 0; \\\n\ + dst_type out_data = (dst_type)(0); \\\n\ + src_type in_data; \\\n\ + float in_f32, out_f32; \\\n\ + 
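
The bucketize_* kernels above perform a linear scan over a sorted 1-D boundaries tensor: the default variants stop at the first boundary b with x <= b, the "right_" variants at the first b with x < b, and the stored result is the index where the scan stopped (boundaries_size if no boundary matched). A plain C reference of that behaviour, kept as the same linear scan rather than a binary search, for illustration only:

#include <stdio.h>

/* Reference bucketize: returns the bucket index of x for a sorted
 * boundaries array. right = 0 mirrors the "<=" kernels, right = 1 the
 * "right_" kernels; mirrors the do/while scan in the CL source. */
static int bucketize_ref(float x, const float *boundaries, int n, int right)
{
    int pos;
    for (pos = 0; pos < n; pos++)
    {
        float b = boundaries[pos];
        if (( right && x <  b) ||
            (!right && x <= b))
        {
            break;
        }
    }
    return pos;  /* equals n when x is past every boundary */
}

int main(void)
{
    const float bounds[3] = { 1.0f, 3.0f, 5.0f };
    /* 3.0 lands in bucket 1 for the default variant, bucket 2 for "right". */
    printf("%d %d\n",
           bucketize_ref(3.0f, bounds, 3, 0),
           bucketize_ref(3.0f, bounds, 3, 1));
    return 0;
}
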
hstart = max(hstart, 0); \\\n\ + wstart = max(wstart, 0); \\\n\ + for (h = hstart; h < hend; h++) \\\n\ + { \\\n\ + for (w = wstart; w < wend; w++) \\\n\ + { \\\n\ + coord_in.xy = (int2)(w, h); \\\n\ + in_data = readimage_type(input, coord_in).x; \\\n\ + in_f32 = convert_float(in_data) * inputScale + inputTail; \\\n\ + sum_of_pow += pow(fabs(in_f32),p); \\\n\ + } \\\n\ + } \\\n\ + out_f32 = pow(sum_of_pow, 1.0f / p) * outputScale + outputTail; \\\n\ + out_data.x = conv_mode(out_f32); \\\n\ + writeimage_type(output, coord_out, out_data); \\\n\ +\n\ +#define TENSOR_LPPOOL(src_name, dst_name, src_type, dst_type, readimage_type, conv_mode, writeimage_type) \\\n\ +__kernel void lppool_##src_name##to##dst_name ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int ksize_x, \\\n\ + int ksize_y, \\\n\ + int stride_x, \\\n\ + int stride_y, \\\n\ + int pad_left, \\\n\ + int pad_top, \\\n\ + int p, \\\n\ + int width, \\\n\ + int height, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputTail) \\\n\ +{ \\\n\ + LPPOOL_PROCESS(src_type, dst_type, readimage_type, conv_mode, writeimage_type); \\\n\ +}\n\ +\n\ +TENSOR_LPPOOL(F32, F32, float, float4, read_imagef, convert_float, write_imagef)\n\ +TENSOR_LPPOOL(F32, U32, float, uint4, read_imagef, convert_uint, write_imageui)\n\ +TENSOR_LPPOOL(F32, I32, float, int4, read_imagef, convert_int, write_imagei)\n\ +\n\ +TENSOR_LPPOOL(U32, U32, uint, uint4, read_imageui, convert_uint, write_imageui)\n\ +TENSOR_LPPOOL(U32, F32, uint, float4, read_imageui, convert_float, write_imagef)\n\ +TENSOR_LPPOOL(U32, I32, uint, int4, read_imageui, convert_int, write_imagei)\n\ +\n\ +TENSOR_LPPOOL(I32, I32, int, int4, read_imagei, convert_int, write_imagei)\n\ +TENSOR_LPPOOL(I32, F32, int, float4, read_imagei, convert_float, write_imagef)\n\ +TENSOR_LPPOOL(I32, U32, int, uint4, read_imagei, convert_uint, write_imageui)\n\ +\n\ +__kernel void lppool_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int ksize_x,\n\ + int ksize_y,\n\ + int stride_x,\n\ + int stride_y,\n\ + int pad_left,\n\ + int pad_top,\n\ + int p,\n\ + int width,\n\ + int height,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputTail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0);\n\ + int4 coord_in = coord_out;\n\ + int h, w;\n\ + float sum_of_pow = 0;\n\ + float out_data_f32 = 0;\n\ + uint4 dst = (uint4)(0);\n\ + float4 data_f32 = (float4)(0);\n\ + uint4 data;\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ +\n\ + for (h = hstart; h < hend; h++)\n\ + {\n\ + for (w = wstart; w < wend; w++)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, data_f32, data, 16);\n\ + sum_of_pow += pow(abs(data_f32.x),p);\n\ + }\n\ + }\n\ + out_data_f32 = pow(sum_of_pow, 1.0f / p);\n\ + _viv_asm(COPY, dst, out_data_f32, 4);\n\ + dst.x = dst.x >> 16;\n\ + write_imageui(output, coord_out, dst);\n\ +}\n\ +\n\ +"; /* end of lppool_cl*/ + static const char lstmunit_activation_BP_F32_cl[] = "float4 sigmoid(float4 x, float logE)\n\ {\n\ x *= -logE;\n\ @@ -53543,7 +53740,7 @@ __kernel void 
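
lppool_cl above computes, for each output element, the Lp norm of the window contents: out = (sum |x|^p)^(1/p), with the window defined by ksize/stride/pad and clipped to the input extent; the BF16 variant additionally reinterprets each 16-bit bfloat value as the upper half of a 32-bit IEEE float (the "data << 16" plus COPY trick). A C sketch of both pieces on a row-major single-channel input, ignoring the quantization scale/tail terms for clarity; this is an illustrative reference, not the kernel itself:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* bfloat16 -> float: place the 16 stored bits in the high half of a
 * 32-bit word, matching the shift-and-copy done by the BF16 kernels. */
static float bf16_to_f32(uint16_t v)
{
    uint32_t bits = (uint32_t)v << 16;
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}

/* Lp pooling of one output position (ox, oy) over a padded window.
 * in is width*height, row major; returns (sum |x|^p)^(1/p). */
static float lppool_ref(const float *in, int width, int height,
                        int ox, int oy, int ksize_x, int ksize_y,
                        int stride_x, int stride_y,
                        int pad_left, int pad_top, float p)
{
    int hstart = oy * stride_y - pad_top;
    int wstart = ox * stride_x - pad_left;
    int hend = hstart + ksize_y < height ? hstart + ksize_y : height;
    int wend = wstart + ksize_x < width  ? wstart + ksize_x : width;
    float sum_of_pow = 0.0f;

    hstart = hstart > 0 ? hstart : 0;
    wstart = wstart > 0 ? wstart : 0;
    for (int h = hstart; h < hend; h++)
        for (int w = wstart; w < wend; w++)
            sum_of_pow += powf(fabsf(in[h * width + w]), p);

    return powf(sum_of_pow, 1.0f / p);
}

int main(void)
{
    const float in[4] = { 3.0f, 4.0f, 0.0f, 0.0f };  /* 2x2 input */
    /* 2x2 window, p = 2 -> Euclidean norm of {3,4,0,0} = 5. */
    printf("%f  bf16(0x4049)=%f\n",
           lppool_ref(in, 2, 2, 0, 0, 2, 2, 1, 1, 0, 0, 2.0f),
           bf16_to_f32(0x4049));   /* approximately pi */
    return 0;
}
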
maximum_I32I32toI32\n\ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ float4 data = data0 > data1 ? data0 : data1;\n\ - int4 dst = convert_int4(data * outputScale + outputZP);\n\ + int4 dst = convert_int4_rte(data * outputScale + outputZP);\n\ \n\ write_imagei(output, coord, dst);\n\ }\n\ @@ -53569,7 +53766,7 @@ __kernel void maximum_I32I32toI32_2D\n\ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ float4 data = data0 > data1 ? data0 : data1;\n\ - int4 dst = convert_int4(data * outputScale + outputZP);\n\ + int4 dst = convert_int4_rte(data * outputScale + outputZP);\n\ \n\ write_imagei(output, coord, dst);\n\ }\n\ @@ -54086,7 +54283,7 @@ __kernel void minimum_I32I32toI32\n\ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ float4 data = data0 < data1 ? data0 : data1;\n\ - int4 dst = convert_int4(data * outputScale + outputZP);\n\ + int4 dst = convert_int4_rte(data * outputScale + outputZP);\n\ \n\ write_imagei(output, coord, dst);\n\ }\n\ @@ -54112,7 +54309,7 @@ __kernel void minimum_I32I32toI32_2D\n\ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ float4 data = data0 < data1 ? data0 : data1;\n\ - int4 dst = convert_int4(data * outputScale + outputZP);\n\ + int4 dst = convert_int4_rte(data * outputScale + outputZP);\n\ \n\ write_imagei(output, coord, dst);\n\ }\n\ @@ -58290,15 +58487,17 @@ __kernel void resize_nearest_U8toU8(\n\ }\n\ "; /* end of resize_nearest_cl*/ -static const char roi_align_cl[] = "inline float roi_align_1x1\n\ +static const char roi_align_cl[] = "\n\ +inline float roi_align_1x1\n\ (\n\ __read_only image2d_array_t input,\n\ - float2 region_start,\n\ - float2 region_end,\n\ - float2 bin_size,\n\ - int2 grid_size,\n\ - float2 rcp_of_grid_size,\n\ - int pz\n\ + float2 region_start,\n\ + float2 region_end,\n\ + float2 bin_size,\n\ + int2 grid_size,\n\ + float2 rcp_of_grid_size,\n\ + int pz,\n\ + int4 max_spatial_dims\n\ )\n\ {\n\ float sum = 0;\n\ @@ -58313,15 +58512,24 @@ static const char roi_align_cl[] = "inline float roi_align_1x1\n\ int2 xy_low = convert_int2(pos);\n\ int2 xy_high = xy_low + 1;\n\ \n\ - float ly = pos.y - xy_low.y;\n\ - float lx = pos.x - xy_low.x;\n\ - float hy = 1.0f - ly;\n\ - float hx = 1.0f - lx;\n\ + if (xy_low.x > max_spatial_dims.x || max_spatial_dims.x < -1 ||\n\ + xy_low.y > max_spatial_dims.y || max_spatial_dims.y < -1 )\n\ + {\n\ + continue;\n\ + }\n\ +\n\ + float2 lxy = pos - floor(pos);\n\ + float2 zero = 0;\n\ +\n\ + lxy = xy_low >= max_spatial_dims.zw ? 
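
The maximum/minimum I32 kernels above switch the final store from convert_int4 (which truncates toward zero, OpenCL's default for float-to-int conversion) to convert_int4_rte (round to nearest, ties to even), removing a systematic downward bias when requantizing. A small C illustration of the difference, using rintf() under the default rounding mode as a stand-in for the _rte conversion (an analogy, not the OpenCL builtin itself):

#include <math.h>
#include <stdio.h>

int main(void)
{
    /* Requantized values near .5 boundaries show where the modes diverge. */
    const float vals[4] = { 2.5f, 3.5f, -2.5f, 4.999999f };
    for (int i = 0; i < 4; i++)
    {
        int trunc_toward_zero  = (int)vals[i];         /* convert_int behaviour     */
        int round_nearest_even = (int)rintf(vals[i]);  /* convert_int_rte behaviour */
        printf("%f -> trunc %d, rte %d\n",
               vals[i], trunc_toward_zero, round_nearest_even);
    }
    return 0;
}
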
0.0 : lxy;\n\ +\n\ + float hy = 1.0f - lxy.y;\n\ + float hx = 1.0f - lxy.x;\n\ \n\ float w1 = hy * hx;\n\ - float w2 = hy * lx;\n\ - float w3 = ly * hx;\n\ - float w4 = ly * lx;\n\ + float w2 = lxy.x - lxy.x * lxy.y;\n\ + float w3 = lxy.y - lxy.x * lxy.y;\n\ + float w4 = lxy.y * lxy.x;\n\ \n\ float data1 = read_imagef(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x;\n\ float data2 = read_imagef(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x;\n\ @@ -58335,8 +58543,9 @@ static const char roi_align_cl[] = "inline float roi_align_1x1\n\ return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y);\n\ }\n\ \n\ -\n\ #define EPS_GRID 0.00001f\n\ +#define TYPE_FLOAT16 (1)\n\ +#define TYPE_FLOAT32 (2)\n\ __kernel void roi_align_F32_F32toF32\n\ (\n\ __read_only image2d_array_t input,\n\ @@ -58349,13 +58558,14 @@ __kernel void roi_align_F32_F32toF32\n\ float output_zp,\n\ float spatial_x_scale,\n\ float spatial_y_scale,\n\ - float in_width,\n\ - float in_height,\n\ + int in_width,\n\ + int in_height,\n\ float rcp_of_out_width,\n\ float rcp_of_out_height,\n\ float sampling_x_ratio,\n\ float sampling_y_ratio,\n\ - int depth\n\ + int depth,\n\ + int dtype\n\ )\n\ {\n\ int px = get_global_id(0);\n\ @@ -58374,7 +58584,10 @@ __kernel void roi_align_F32_F32toF32\n\ \n\ float2 spatial_indx = (float2)(px, py);\n\ float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height);\n\ - float2 max_spatial_dims = (float2)(in_width, in_height);\n\ + int4 max_spatial_dims = (int4)(in_width, in_height, in_width, in_height);\n\ + max_spatial_dims.zw = max_spatial_dims.zw - 1;\n\ +\n\ + float2 max_limiatation = convert_float2(max_spatial_dims.zw);\n\ \n\ float2 bin_size = roi_dims * pooled_dims;\n\ float2 region_start = spatial_indx * bin_size + roi_anchor.xy;\n\ @@ -58397,9 +58610,28 @@ __kernel void roi_align_F32_F32toF32\n\ bin_size,\n\ grid_size_xy,\n\ rcp_of_grid_size,\n\ - kz);\n\ + kz,\n\ + max_spatial_dims);\n\ \n\ - write_imagef(output, (int4)(px, py, kz1, 0), interp);\n\ + if (dtype == TYPE_FLOAT16)\n\ + {\n\ + half tmp;\n\ + short dst;\n\ + _viv_asm(CONV, tmp, interp.x);\n\ + _viv_asm(COPY, dst, tmp, 2);\n\ +\n\ + Tensor out_t = create_tensor_from_image2d_array(output, 2);\n\ + short *output_ptr = (short *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0));\n\ +\n\ + output_ptr[0] = dst;\n\ + }\n\ + else\n\ + {\n\ + Tensor out_t = create_tensor_from_image2d_array(output, 4);\n\ + float *output_ptr = (float *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0));\n\ +\n\ + output_ptr[0] = interp.x;\n\ + }\n\ }\n\ }\n\ \n\ @@ -58413,7 +58645,8 @@ inline float roi_align_1x1_U8toF32\n\ float2 bin_size,\n\ int2 grid_size,\n\ float2 rcp_of_grid_size,\n\ - int pz\n\ + int pz,\n\ + int4 max_spatial_dims\n\ )\n\ {\n\ float sum = 0;\n\ @@ -58424,33 +58657,43 @@ inline float roi_align_1x1_U8toF32\n\ {\n\ float2 ixy = (float2)(ix + 0.5f, iy + 0.5f);\n\ float2 pos = region_start + ixy * bin_size * rcp_of_grid_size;\n\ -\n\ + \n\ int2 xy_low = convert_int2(pos);\n\ int2 xy_high = xy_low + 1;\n\ -\n\ - float ly = pos.y - xy_low.y;\n\ - float lx = pos.x - xy_low.x;\n\ - float hy = 1.0f - ly;\n\ - float hx = 1.0f - lx;\n\ -\n\ + \n\ + float2 lxy = pos - floor(pos);\n\ + float2 zero = 0;\n\ + \n\ + if (xy_low.x > max_spatial_dims.x || max_spatial_dims.x < -1 ||\n\ + xy_low.y > max_spatial_dims.y || max_spatial_dims.y < -1 )\n\ + {\n\ + continue;\n\ + }\n\ + \n\ + lxy = xy_low >= max_spatial_dims.zw ? 
0.0 : lxy;\n\ + \n\ + float hy = 1.0f - lxy.y;\n\ + float hx = 1.0f - lxy.x;\n\ + \n\ float w1 = hy * hx;\n\ - float w2 = hy * lx;\n\ - float w3 = ly * hx;\n\ - float w4 = ly * lx;\n\ -\n\ + float w2 = lxy.x - lxy.x * lxy.y;\n\ + float w3 = lxy.y - lxy.x * lxy.y;\n\ + float w4 = lxy.y * lxy.x;\n\ + \n\ uint4 data;\n\ data.x = read_imageui(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x;\n\ data.y = read_imageui(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x;\n\ data.z = read_imageui(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x;\n\ data.w = read_imageui(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x;\n\ -\n\ + \n\ float4 value = convert_float4(data) * input_scale + input_tail;\n\ -\n\ + \n\ sum = sum + w1 * value.x + w2 * value.y + w3 * value.z + w4 * value.w;\n\ }\n\ }\n\ -\n\ + \n\ return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y);\n\ +\n\ }\n\ \n\ __kernel void roi_align_U8_U16toU8\n\ @@ -58465,13 +58708,14 @@ __kernel void roi_align_U8_U16toU8\n\ float output_zp,\n\ float spatial_x_scale,\n\ float spatial_y_scale,\n\ - float in_width,\n\ - float in_height,\n\ + int in_width,\n\ + int in_height,\n\ float rcp_of_out_width,\n\ float rcp_of_out_height,\n\ float sampling_x_ratio,\n\ float sampling_y_ratio,\n\ - int depth\n\ + int depth,\n\ + int dtype\n\ )\n\ {\n\ int px = get_global_id(0);\n\ @@ -58490,7 +58734,10 @@ __kernel void roi_align_U8_U16toU8\n\ \n\ float2 spatial_indx = (float2)(px, py);\n\ float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height);\n\ - float2 max_spatial_dims = (float2)(in_width, in_height);\n\ + int4 max_spatial_dims = (int4)(in_width, in_height, in_width, in_height);\n\ + max_spatial_dims.zw = max_spatial_dims.zw - 1;\n\ +\n\ + float2 max_limiatation = convert_float2(max_spatial_dims.zw);\n\ \n\ float2 bin_size = roi_dims * pooled_dims;\n\ float2 region_start = spatial_indx * bin_size + roi_anchor.xy;\n\ @@ -58515,16 +58762,909 @@ __kernel void roi_align_U8_U16toU8\n\ bin_size,\n\ grid_size_xy,\n\ rcp_of_grid_size,\n\ - kz);\n\ + kz,\n\ + max_spatial_dims);\n\ \n\ - uint4 dst;\n\ + uchar dst;\n\ interp.x = interp.x * output_scale + output_zp;\n\ interp.x = interp.x < 255 ? 
interp.x : 255;\n\ - dst.x = convert_uint_rte(interp.x);\n\ - write_imageui(output, (int4)(px, py, kz1, 0), dst.xxxx);\n\ + dst = convert_uchar_rte(interp.x);\n\ +\n\ + Tensor out_t = create_tensor_from_image2d_array(output, 1);\n\ + uchar *output_ptr = (uchar *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0));\n\ + \n\ + output_ptr[0] = dst;\n\ }\n\ }"; /* end of roi_align_cl*/ +static const char scatter_elements_cl[] = "\n\ +#define SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(name, dtype) \\\n\ +__kernel void scatter_elements_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 4); \\\n\ + Image update_i = create_image_from_image2d(update, 4); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = create_image_from_image2d(output, 4); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = ref_ptr[0]; \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + if (offset == coord.x) \\\n\ + { \\\n\ + data = update_ptr[x]; \\\n\ + break; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(F32_I32_F32toF32, float)\n\ +SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(I32_I32_I32toI32, int)\n\ +\n\ +#define SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 2); \\\n\ + Image update_i = create_image_from_image2d(update, 2); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = create_image_from_image2d(output, 2); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + 
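
The roi_align_1x1 helpers above still perform bilinear interpolation per grid sample; the rewritten weights (w2 = lx - lx*ly, w3 = ly - lx*ly, w4 = lx*ly) are algebraically identical to hy*lx, ly*hx and ly*lx, and the new max_spatial_dims argument lets the kernel skip samples that fall outside the feature map and zero the fractional offset at the right/bottom edge so out-of-range neighbours receive zero weight. A simplified C sketch of one bilinear sample with comparable edge handling (index clamping stands in for the image sampler's border behaviour; this is not a line-for-line port of the kernel):

#include <math.h>
#include <stdio.h>

/* One bilinear sample at (x, y) on a width x height row-major map. */
static float bilinear_sample(const float *map, int width, int height,
                             float x, float y)
{
    int x0 = (int)floorf(x), y0 = (int)floorf(y);

    if (x0 > width - 1 || y0 > height - 1 || x0 < -1 || y0 < -1)
        return 0.0f;                       /* sample completely outside: skip */

    float lx = x - (float)x0, ly = y - (float)y0;
    if (x0 >= width  - 1) lx = 0.0f;       /* zero weight past the last column */
    if (y0 >= height - 1) ly = 0.0f;       /* zero weight past the last row    */

    int x1 = x0 + 1 < width  ? x0 + 1 : width  - 1;
    int y1 = y0 + 1 < height ? y0 + 1 : height - 1;
    x0 = x0 < 0 ? 0 : x0;
    y0 = y0 < 0 ? 0 : y0;

    float hx = 1.0f - lx, hy = 1.0f - ly;
    float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

    return w1 * map[y0 * width + x0] + w2 * map[y0 * width + x1] +
           w3 * map[y1 * width + x0] + w4 * map[y1 * width + x1];
}

int main(void)
{
    const float map[4] = { 0.0f, 1.0f, 2.0f, 3.0f };       /* 2x2 map */
    printf("%f\n", bilinear_sample(map, 2, 2, 0.5f, 0.5f)); /* 1.5 */
    return 0;
}
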
if (offset == coord.x) \\\n\ + { \\\n\ + data = conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \\\n\ + break; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)\n\ +SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)\n\ +SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)\n\ +\n\ +#define SCATTER_ELEMENTS_AXIS0_8BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 1); \\\n\ + Image update_i = create_image_from_image2d(update, 1); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = create_image_from_image2d(output, 1); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + if (offset == coord.x) \\\n\ + { \\\n\ + data = conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \\\n\ + break; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SCATTER_ELEMENTS_AXIS0_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)\n\ +SCATTER_ELEMENTS_AXIS0_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)\n\ +\n\ +#define SCATTER_ELEMENTS_AXIS1_32BITS_IMPL(name, dtype) \\\n\ +__kernel void scatter_elements_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 4); \\\n\ + Tensor update_i = create_tensor_from_image2d_array(update, 4); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 4); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = ref_ptr[0]; \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype 
*)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data = update_ptr[y * inner_size]; \\\n\ + break; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SCATTER_ELEMENTS_AXIS1_32BITS_IMPL(F32_I32_F32toF32, float)\n\ +SCATTER_ELEMENTS_AXIS1_32BITS_IMPL(I32_I32_I32toI32, int)\n\ +\n\ +#define SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 2); \\\n\ + Tensor update_i = create_tensor_from_image2d_array(update, 2); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 2); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data = conv_func(convert_float(update_ptr[y * inner_size]) \\\n\ + * update_scale + update_tail + output_zp); \\\n\ + break; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)\n\ +SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)\n\ +SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)\n\ +\n\ +#define SCATTER_ELEMENTS_AXIS1_8BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 1); \\\n\ + Tensor update_i = create_tensor_from_image2d_array(update, 1); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 1); \\\n\ + \\\n\ + dtype 
*ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data = conv_func(convert_float(update_ptr[y * inner_size]) \\\n\ + * update_scale + update_tail + output_zp); \\\n\ + break; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SCATTER_ELEMENTS_AXIS1_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)\n\ +SCATTER_ELEMENTS_AXIS1_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)\n\ +"; /* end of scatter_elements_cl*/ + +static const char scatter_elements_add_cl[] = "\n\ +#define SE_ADD_AXIS0_32BITS_IMPL(name, dtype) \\\n\ +__kernel void scatter_elements_add_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 4); \\\n\ + Image update_i = create_image_from_image2d(update, 4); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = create_image_from_image2d(output, 4); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = ref_ptr[0]; \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + if (offset == coord.x) \\\n\ + { \\\n\ + data += update_ptr[x]; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_ADD_AXIS0_32BITS_IMPL(F32_I32_F32toF32, float)\n\ +SE_ADD_AXIS0_32BITS_IMPL(I32_I32_I32toI32, int)\n\ +\n\ +#define SE_ADD_AXIS0_16BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_add_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 2); \\\n\ + Image update_i = create_image_from_image2d(update, 2); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = 
create_image_from_image2d(output, 2); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + if (offset == coord.x) \\\n\ + { \\\n\ + data += conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_ADD_AXIS0_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)\n\ +SE_ADD_AXIS0_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)\n\ +SE_ADD_AXIS0_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)\n\ +\n\ +#define SE_ADD_AXIS0_8BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_add_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 1); \\\n\ + Image update_i = create_image_from_image2d(update, 1); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = create_image_from_image2d(output, 1); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + if (offset == coord.x) \\\n\ + { \\\n\ + data += conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_ADD_AXIS0_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)\n\ +SE_ADD_AXIS0_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)\n\ +\n\ +#define SE_ADD_AXIS1_32BITS_IMPL(name, dtype) \\\n\ +__kernel void scatter_elements_add_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 4); \\\n\ + Tensor 
update_i = create_tensor_from_image2d_array(update, 4); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 4); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = ref_ptr[0]; \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data += update_ptr[y * inner_size]; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_ADD_AXIS1_32BITS_IMPL(F32_I32_F32toF32, float)\n\ +SE_ADD_AXIS1_32BITS_IMPL(I32_I32_I32toI32, int)\n\ +\n\ +#define SE_ADD_AXIS1_16BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_add_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 2); \\\n\ + Tensor update_i = create_tensor_from_image2d_array(update, 2); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 2); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data += conv_func(convert_float(update_ptr[y * inner_size]) \\\n\ + * update_scale + update_tail + output_zp); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_ADD_AXIS1_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)\n\ +SE_ADD_AXIS1_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)\n\ +SE_ADD_AXIS1_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)\n\ +\n\ +#define SE_ADD_AXIS1_8BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_add_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) 
\\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 1); \\\n\ + Tensor update_i = create_tensor_from_image2d_array(update, 1); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 1); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data += conv_func(convert_float(update_ptr[y * inner_size]) \\\n\ + * update_scale + update_tail + output_zp); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_ADD_AXIS1_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)\n\ +SE_ADD_AXIS1_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)\n\ +"; /* end of scatter_elements_add_cl*/ + +static const char scatter_elements_mul_cl[] = "\n\ +#define SE_MUL_AXIS0_32BITS_IMPL(name, dtype) \\\n\ +__kernel void scatter_elements_mul_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 4); \\\n\ + Image update_i = create_image_from_image2d(update, 4); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = create_image_from_image2d(output, 4); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = ref_ptr[0]; \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + if (offset == coord.x) \\\n\ + { \\\n\ + data *= update_ptr[x]; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_MUL_AXIS0_32BITS_IMPL(F32_I32_F32toF32, float)\n\ +SE_MUL_AXIS0_32BITS_IMPL(I32_I32_I32toI32, int)\n\ +\n\ +#define SE_MUL_AXIS0_16BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_mul_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + 
int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 2); \\\n\ + Image update_i = create_image_from_image2d(update, 2); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = create_image_from_image2d(output, 2); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + if (offset == coord.x) \\\n\ + { \\\n\ + data *= conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_MUL_AXIS0_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)\n\ +SE_MUL_AXIS0_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)\n\ +SE_MUL_AXIS0_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)\n\ +\n\ +#define SE_MUL_AXIS0_8BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_mul_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 1); \\\n\ + Image update_i = create_image_from_image2d(update, 1); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = create_image_from_image2d(output, 1); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + if (offset == coord.x) \\\n\ + { \\\n\ + data *= conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_MUL_AXIS0_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)\n\ +SE_MUL_AXIS0_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)\n\ +\n\ +#define SE_MUL_AXIS1_32BITS_IMPL(name, dtype) \\\n\ +__kernel void scatter_elements_mul_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + 
float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 4); \\\n\ + Tensor update_i = create_tensor_from_image2d_array(update, 4); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 4); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = ref_ptr[0]; \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data *= update_ptr[y * inner_size]; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_MUL_AXIS1_32BITS_IMPL(F32_I32_F32toF32, float)\n\ +SE_MUL_AXIS1_32BITS_IMPL(I32_I32_I32toI32, int)\n\ +\n\ +#define SE_MUL_AXIS1_16BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_mul_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 2); \\\n\ + Tensor update_i = create_tensor_from_image2d_array(update, 2); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 2); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data *= conv_func(convert_float(update_ptr[y * inner_size]) \\\n\ + * update_scale + update_tail + output_zp); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_MUL_AXIS1_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)\n\ +SE_MUL_AXIS1_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)\n\ +SE_MUL_AXIS1_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)\n\ +\n\ +#define SE_MUL_AXIS1_8BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_mul_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only 
image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 1); \\\n\ + Tensor update_i = create_tensor_from_image2d_array(update, 1); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 1); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data *= conv_func(convert_float(update_ptr[y * inner_size]) \\\n\ + * update_scale + update_tail + output_zp); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_MUL_AXIS1_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)\n\ +SE_MUL_AXIS1_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)\n\ +"; /* end of scatter_elements_mul_cl*/ + static const char scatter_nd_cl[] = "__kernel void scatter_nd_U32toU32_1D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ @@ -60287,6 +61427,7 @@ static const source_map_t evis_resource[] = {"argmin_axis2_vx", argmin_axis2_vx}, {"batchnorm_single_vx", batchnorm_single_vx}, {"batchnorm_single_f32_vx", batchnorm_single_f32_vx}, + {"bucketize_vx", bucketize_vx}, {"cast_vx", cast_vx}, {"clip_F16_vx", clip_F16_vx}, {"clip_I16_vx", clip_I16_vx}, @@ -60413,9 +61554,8 @@ static const source_map_t evis_resource[] = {"pre_process_gray_vx", pre_process_gray_vx}, {"pre_process_gray_2_vx", pre_process_gray_2_vx}, {"pre_process_gray_copy_vx", pre_process_gray_copy_vx}, + {"pre_process_nv12_copy_vx", pre_process_nv12_copy_vx}, {"pre_process_nv12_scale_vx", pre_process_nv12_scale_vx}, - {"pre_process_nv12_scale_8bits_vx", pre_process_nv12_scale_8bits_vx}, - {"pre_process_nv12_scale_mix_vx", pre_process_nv12_scale_mix_vx}, {"pre_process_rgb_vx", pre_process_rgb_vx}, {"pre_process_rgb888_planar_0_vx", pre_process_rgb888_planar_0_vx}, {"pre_process_rgb888_planar_1_vx", pre_process_rgb888_planar_1_vx}, @@ -60424,11 +61564,11 @@ static const source_map_t evis_resource[] = {"pre_process_rgb888_planar_sep_1_vx", pre_process_rgb888_planar_sep_1_vx}, {"pre_process_rgb888_planar_sep_2_vx", pre_process_rgb888_planar_sep_2_vx}, {"pre_process_rgb_copy_vx", pre_process_rgb_copy_vx}, - {"pre_process_yuv420_copy_u8_vx", pre_process_yuv420_copy_u8_vx}, - {"pre_process_yuv420_scale_fp16_vx", pre_process_yuv420_scale_fp16_vx}, - {"pre_process_yuv420_scale_i16_vx", pre_process_yuv420_scale_i16_vx}, - {"pre_process_yuv420_scale_i8_vx", pre_process_yuv420_scale_i8_vx}, - {"pre_process_yuv420_scale_u8_vx", pre_process_yuv420_scale_u8_vx}, + {"pre_process_yuv420_copy_vx", 
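
The scatter_elements*_cl kernels above implement ScatterElements-style updates along axis 0 or 1: the output starts as a copy of ref, and every (index, update) pair overwrites, adds to, or multiplies into the addressed element depending on the variant (the plain variant takes the first matching index and breaks; the add/mul variants keep accumulating over all matches). A compact, output-centric C reference for the 1-D (axis-0, inner_size == 1) case, for illustration only:

#include <stdio.h>

/* reduction: 0 = replace (first match wins), 1 = add, 2 = mul */
static void scatter_elements_1d(float *out, const float *ref, int ref_len,
                                const int *indices, const float *updates,
                                int n, int reduction)
{
    for (int i = 0; i < ref_len; i++)
        out[i] = ref[i];                         /* start from ref */

    for (int i = 0; i < ref_len; i++)            /* one pass per output element */
    {
        for (int k = 0; k < n; k++)
        {
            if (indices[k] != i)
                continue;
            if (reduction == 1)      out[i] += updates[k];
            else if (reduction == 2) out[i] *= updates[k];
            else { out[i] = updates[k]; break; } /* plain scatter: first hit */
        }
    }
}

int main(void)
{
    const float ref[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
    const int   idx[3] = { 0, 2, 2 };
    const float upd[3] = { 5.0f, 3.0f, 4.0f };
    float out[4];

    scatter_elements_1d(out, ref, 4, idx, upd, 3, 1);        /* add reduction */
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 6 1 8 1 */
    return 0;
}
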
pre_process_yuv420_copy_vx}, + {"pre_process_yuv420_scale_0_vx", pre_process_yuv420_scale_0_vx}, + {"pre_process_yuv420_scale_1_vx", pre_process_yuv420_scale_1_vx}, + {"pre_process_yuv422_copy_vx", pre_process_yuv422_copy_vx}, + {"pre_process_yuv422_scale_vx", pre_process_yuv422_scale_vx}, {"pre_process_yuv444_copy_u8_vx", pre_process_yuv444_copy_u8_vx}, {"pre_process_yuv444_scale_vx", pre_process_yuv444_scale_vx}, {"pre_process_yuv444_scale_fp16_vx", pre_process_yuv444_scale_fp16_vx}, @@ -60510,6 +61650,7 @@ static const source_map_t cl_resource[] = {"argmin_axis1_cl", argmin_axis1_cl}, {"argmin_axis2_cl", argmin_axis2_cl}, {"batchnorm_single_cl", batchnorm_single_cl}, + {"bucketize_cl", bucketize_cl}, {"cast_cl", cast_cl}, {"clip_BF16_cl", clip_BF16_cl}, {"clip_F32_cl", clip_F32_cl}, @@ -60549,6 +61690,7 @@ static const source_map_t cl_resource[] = {"log_softmax_axis2_cl", log_softmax_axis2_cl}, {"logical_not_cl", logical_not_cl}, {"logical_ops_cl", logical_ops_cl}, + {"lppool_cl", lppool_cl}, {"lstmunit_activation_BP_F32_cl", lstmunit_activation_BP_F32_cl}, {"lstmunit_activation_BP_U8_cl", lstmunit_activation_BP_U8_cl}, {"lstmunit_activation_B_F32_cl", lstmunit_activation_B_F32_cl}, @@ -60611,6 +61753,9 @@ static const source_map_t cl_resource[] = {"resize_bilinear_cl", resize_bilinear_cl}, {"resize_nearest_cl", resize_nearest_cl}, {"roi_align_cl", roi_align_cl}, + {"scatter_elements_cl", scatter_elements_cl}, + {"scatter_elements_add_cl", scatter_elements_add_cl}, + {"scatter_elements_mul_cl", scatter_elements_mul_cl}, {"scatter_nd_cl", scatter_nd_cl}, {"scatter_nd_update_cl", scatter_nd_update_cl}, {"select_cl", select_cl}, diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c index 69f987a..8462aad 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c @@ -478,7 +478,7 @@ vsi_status vsi_nn_ClientNodePassParameters ) { vsi_status status; - uint8_t i; + uint32_t i; status = VSI_FAILURE; for( i = 0; i < num; i++ ) diff --git a/src/tim/vx/internal/src/makefile.linux b/src/tim/vx/internal/src/makefile.linux index 45e11b8..e06adcf 100644 --- a/src/tim/vx/internal/src/makefile.linux +++ b/src/tim/vx/internal/src/makefile.linux @@ -1,8 +1,207 @@ +# to make ovxlib can compile both IDE and SKD +# if you want to use IDE to compile : export USE_IDE_LIB=1 +# and VIVANTE_SDK_DIR=..../VeriSilicon/VivanteIDE5.4.0/cmdtools/vsimulator + +################################################################################### +#common parts +# OBJECTS. 
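
The resource-table hunks above register the new embedded sources (bucketize, lppool, scatter_elements, the reworked NV12/YUV420 and new YUV422 pre-process programs) by name in the evis_resource / cl_resource tables so they can be resolved when a kernel program is built; the accompanying vsi_nn_vxkernel.c change widens the parameter loop counter from uint8_t to uint32_t so it cannot wrap when more than 255 parameters are passed. A sketch of the name-to-source lookup pattern these tables support; the helper name and entries below are hypothetical, the real resolver in vsi_nn_libnnext_resource.c is not shown here:

#include <stdio.h>
#include <string.h>

/* Mirrors the source_map_t tables: a resource name paired with the
 * embedded program source string. */
typedef struct
{
    const char *name;
    const char *source;
} source_map_t;

static const source_map_t cl_resource_example[] =
{
    { "bucketize_cl",        "/* ...embedded OpenCL source... */" },
    { "lppool_cl",           "/* ...embedded OpenCL source... */" },
    { "scatter_elements_cl", "/* ...embedded OpenCL source... */" },
};

/* Hypothetical lookup helper: linear search by resource name. */
static const char *find_source_by_name(const source_map_t *map, size_t n,
                                       const char *name)
{
    for (size_t i = 0; i < n; i++)
        if (strcmp(map[i].name, name) == 0)
            return map[i].source;
    return NULL;  /* unknown kernel resource */
}

int main(void)
{
    const char *src = find_source_by_name(cl_resource_example, 3, "lppool_cl");
    printf("%s\n", src ? "found" : "missing");
    return 0;
}
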
+ +OBJECTS = $(OBJ_DIR)/vsi_nn_context.o \ + $(OBJ_DIR)/vsi_nn_client_op.o \ + $(OBJ_DIR)/vsi_nn_graph.o \ + $(OBJ_DIR)/vsi_nn_node_attr_template.o \ + $(OBJ_DIR)/vsi_nn_node.o \ + $(OBJ_DIR)/vsi_nn_ops.o \ + $(OBJ_DIR)/vsi_nn_daemon.o \ + $(OBJ_DIR)/vsi_nn_tensor.o \ + $(OBJ_DIR)/vsi_nn_version.o \ + $(OBJ_DIR)/vsi_nn_rnn.o \ + $(OBJ_DIR)/vsi_nn_rnn_helper.o \ + $(OBJ_DIR)/vsi_nn_internal_node.o \ + $(OBJ_DIR)/vsi_nn_log.o \ + $(OBJ_DIR)/vsi_nn_graph_optimization.o \ + $(OBJ_DIR)/vsi_nn_pre_post_process.o + +vpath %.c utils +OBJECTS += $(OBJ_DIR)/vsi_nn_code_generator.o \ + $(OBJ_DIR)/vsi_nn_binary_tree.o \ + $(OBJ_DIR)/vsi_nn_map.o \ + $(OBJ_DIR)/vsi_nn_link_list.o \ + $(OBJ_DIR)/vsi_nn_math.o \ + $(OBJ_DIR)/vsi_nn_dtype_util.o \ + $(OBJ_DIR)/vsi_nn_shape_util.o \ + $(OBJ_DIR)/vsi_nn_dtype.o \ + $(OBJ_DIR)/vsi_nn_limits.o \ + $(OBJ_DIR)/vsi_nn_vdata.o \ + $(OBJ_DIR)/vsi_nn_util.o \ + $(OBJ_DIR)/vsi_nn_dlfcn.o \ + $(OBJ_DIR)/vsi_nn_constraint_check.o \ + $(OBJ_DIR)/vsi_nn_hashmap.o \ + $(OBJ_DIR)/vsi_nn_tensor_op.o + +vpath %.c quantization +OBJECTS += $(OBJ_DIR)/vsi_nn_dynamic_fixed_point.o \ + $(OBJ_DIR)/vsi_nn_asymmetric_affine.o \ + $(OBJ_DIR)/vsi_nn_perchannel_symmetric_affine.o + +vpath %.c pycc +OBJECTS += $(OBJ_DIR)/vsi_pycc_interface.o + +vpath %.c post +OBJECTS += $(OBJ_DIR)/vsi_nn_post_fasterrcnn.o \ + $(OBJ_DIR)/vsi_nn_post_cmupose.o + +vpath %.c libnnext +OBJECTS += $(OBJ_DIR)/vsi_nn_libnnext_resource.o \ + $(OBJ_DIR)/vsi_nn_vxkernel.o + +vpath %.c cpu_backend +SRCS += ${notdir ${wildcard cpu_backend/*.c}} + +vpath %.c libnnext/ops/kernel +SRCS += ${notdir ${wildcard libnnext/ops/kernel/*.c}} + +vpath %.c ops +SRCS += ${notdir ${wildcard ops/*.c}} + +vpath %.c kernel +SRCS += ${notdir ${wildcard kernel/*.c}} + +vpath %.c kernel/cl +SRCS += ${notdir ${wildcard kernel/cl/*.c}} + +vpath %.c kernel/cpu +SRCS += ${notdir ${wildcard kernel/cpu/*.c}} + +vpath %.c kernel/evis +SRCS += ${notdir ${wildcard kernel/evis/*.c}} + +vpath %.c kernel/vx +SRCS += ${notdir ${wildcard kernel/vx/*.c}} + +vpath %.c kernel/sp +SRCS += ${notdir ${wildcard kernel/sp/*.c}} + +vpath %.c custom/ops +SRCS += ${notdir ${wildcard custom/ops/*.c}} + +vpath %.c custom/ops/kernel/evis +SRCS += ${notdir ${wildcard custom/ops/kernel/evis/*.c}} + +vpath %.c custom/ops/kernel/cl +SRCS += ${notdir ${wildcard custom/ops/kernel/cl/*.c}} + +vpath %.c custom/ops/kernel/cpu +SRCS += ${notdir ${wildcard custom/ops/kernel/cpu/*.c}} + +vpath %.c custom/ops/kernel/sp +SRCS += ${notdir ${wildcard custom/ops/kernel/sp/*.c}} + +OBJECTS += ${patsubst %.c, $(OBJ_DIR)/%.o, $(SRCS)} + +ifeq ($(USE_VIP_DEVICE),1) +vpath %.cpp vip +OBJECTS += $(OBJ_DIR)/virtual_device.o +endif + +################################################################################ +ifeq ($(USE_IDE_LIB),1) +# IDE. + +CC=$(CROSS_COMPILE)gcc + +INCLUDES=-I. 
-I$(VIVANTE_SDK_DIR)/include/ \ + -I$(VIVANTE_SDK_DIR)/include/CL \ + -I$(VIVANTE_SDK_DIR)/include/VX \ + -I../include/ops -I../include/utils -I../include/inference \ + -I../include/client -I../include -I../include/libnnext \ + -I../include/cpu_backend + +ifeq (1,$(DEBUG)) +CFLAGS+=-g +LFLAGS+=-g +else +CFLAGS+=-O3 +LFLAGS+=-O3 +endif +CFLAGS += $(INCLUDES) +CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Werror -Wno-strict-aliasing -Wno-maybe-uninitialized +CFLAGS += -fvisibility=hidden -D'OVXLIB_API=__attribute__((visibility("default")))' + +LIBS+= -L$(VIVANTE_SDK_DIR)/lib \ + -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy -lArchModelSw -lNNArchPerf +LIBS+= -L$(VIVANTE_SDK_DIR)/lib/vsim \ + -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy +LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux \ + -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy +LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux/vsim \ + -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy +LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux/vsim \ + -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy +LIBS+= -L$(VIVANTE_SDK_DIR)/../common/lib/ \ + -lvdtproxy +LIBS += -lm -ldl + +File = $(VIVANTE_SDK_DIR)/lib/libjpeg.a +File2 = $(VIVANTE_SDK_DIR)/lib/x64_linux/libjpeg.a +File3 = $(VIVANTE_SDK_DIR)/../common/lib/libjpeg.a +ifeq ($(File),$(wildcard $(File))) +LIBS+= $(File) +else ifeq ($(File2),$(wildcard $(File2))) +LIBS+= $(File2) +else +LIBS+= $(File3) +endif + +################################################################################### +# Macros. +CFLAGS += -fPIC +DYNAMIC := 1 +TARGET_NAME = libovxlib.so +OBJ_DIR=bin_r +TARGET_OUTPUT = $(OBJ_DIR)/$(TARGET_NAME) + +all: $(TARGET_OUTPUT) +clean: + @rm -rf $(OBJ_DIR)/* $(OBJ_DIR) + +install: $(TARGET_OUTPUT) + +################################################################################ + +LDFLAGS += -Wall -shared -Wl,-soname,$(TARGET_NAME) -Wl,-z,defs -fPIC + +ifeq ($(USE_VIP_DEVICE),1) +LDFLAGS += -pthread +LIBS += -lstdc++ +INCLUDE += -I../include/vip +$(OBJ_DIR)/virtual_device.o: virtual_device.cpp + @echo " COMPILE $(abspath $<)" + @mkdir -p $(OBJ_DIR) + @$(CXX) -c -std=c++14 -pthread $(CFLAGS) -o $@ $< +endif + +$(TARGET_OUTPUT): $(OBJECTS) + @echo " LINK \033[1m$(notdir $@)\033[0m" + @$(CC) $(LDFLAGS) $(OBJECTS) -o $(TARGET_OUTPUT) $(LIBS) + +$(OBJ_DIR)/%.o: %.c + @echo " COMPILE $(abspath $<)" + @mkdir -p $(OBJ_DIR) + @$(CC) -c $(CFLAGS) -o $@ $< + +else +################################################################################## +#SDK. + +# include common definition. include $(AQROOT)/makefile.linux.def +################################################################################# INCLUDE += -I$(VIVANTE_SDK_INC) -I$(VIVANTE_SDK_INC)/HAL -I$(AQROOT)/sdk/inc INCLUDE += -I../include/ops -I../include/utils -I../include/inference INCLUDE += -I../include/client -I../include -I../include/libnnext +INCLUDE += -I../include/cpu_backend CFLAGS += $(INCLUDE) CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Werror @@ -43,89 +242,6 @@ ifneq ($(gcdSTATIC_LINK), 1) endif endif ############################################################################# -# Objects. 
-OBJECTS = $(OBJ_DIR)/vsi_nn_context.o \ - $(OBJ_DIR)/vsi_nn_client_op.o \ - $(OBJ_DIR)/vsi_nn_graph.o \ - $(OBJ_DIR)/vsi_nn_node_attr_template.o \ - $(OBJ_DIR)/vsi_nn_node.o \ - $(OBJ_DIR)/vsi_nn_ops.o \ - $(OBJ_DIR)/vsi_nn_daemon.o \ - $(OBJ_DIR)/vsi_nn_tensor.o \ - $(OBJ_DIR)/vsi_nn_version.o \ - $(OBJ_DIR)/vsi_nn_rnn.o \ - $(OBJ_DIR)/vsi_nn_rnn_helper.o \ - $(OBJ_DIR)/vsi_nn_internal_node.o \ - $(OBJ_DIR)/vsi_nn_log.o \ - $(OBJ_DIR)/vsi_nn_graph_optimization.o \ - $(OBJ_DIR)/vsi_nn_pre_post_process.o - -vpath %.c utils -OBJECTS += $(OBJ_DIR)/vsi_nn_code_generator.o \ - $(OBJ_DIR)/vsi_nn_binary_tree.o \ - $(OBJ_DIR)/vsi_nn_map.o \ - $(OBJ_DIR)/vsi_nn_link_list.o \ - $(OBJ_DIR)/vsi_nn_math.o \ - $(OBJ_DIR)/vsi_nn_dtype_util.o \ - $(OBJ_DIR)/vsi_nn_shape_util.o \ - $(OBJ_DIR)/vsi_nn_dtype.o \ - $(OBJ_DIR)/vsi_nn_limits.o \ - $(OBJ_DIR)/vsi_nn_vdata.o \ - $(OBJ_DIR)/vsi_nn_util.o \ - $(OBJ_DIR)/vsi_nn_constraint_check.o \ - $(OBJ_DIR)/vsi_nn_hashmap.o \ - $(OBJ_DIR)/vsi_nn_tensor_op.o - -vpath %.c quantization -OBJECTS += $(OBJ_DIR)/vsi_nn_dynamic_fixed_point.o \ - $(OBJ_DIR)/vsi_nn_asymmetric_affine.o \ - $(OBJ_DIR)/vsi_nn_perchannel_symmetric_affine.o - -vpath %.c pycc -OBJECTS += $(OBJ_DIR)/vsi_pycc_interface.o - -vpath %.c post -OBJECTS += $(OBJ_DIR)/vsi_nn_post_fasterrcnn.o \ - $(OBJ_DIR)/vsi_nn_post_cmupose.o - -vpath %.c libnnext -OBJECTS += $(OBJ_DIR)/vsi_nn_libnnext_resource.o \ - $(OBJ_DIR)/vsi_nn_vxkernel.o - -vpath %.c libnnext/ops/kernel -SRCS += ${notdir ${wildcard libnnext/ops/kernel/*.c}} - -vpath %.c ops -SRCS += ${notdir ${wildcard ops/*.c}} - -vpath %.c kernel -SRCS += ${notdir ${wildcard kernel/*.c}} - -vpath %.c kernel/cl -SRCS += ${notdir ${wildcard kernel/cl/*.c}} - -vpath %.c kernel/cpu -SRCS += ${notdir ${wildcard kernel/cpu/*.c}} - -vpath %.c kernel/evis -SRCS += ${notdir ${wildcard kernel/evis/*.c}} - -vpath %.c kernel/vx -SRCS += ${notdir ${wildcard kernel/vx/*.c}} - -vpath %.c custom/ops -SRCS += ${notdir ${wildcard custom/ops/*.c}} - -vpath %.c custom/ops/kernel/evis -SRCS += ${notdir ${wildcard custom/ops/kernel/evis/*.c}} - -vpath %.c custom/ops/kernel/cl -SRCS += ${notdir ${wildcard custom/ops/kernel/cl/*.c}} - -vpath %.c custom/ops/kernel/cpu -SRCS += ${notdir ${wildcard custom/ops/kernel/cpu/*.c}} - -OBJECTS += ${patsubst %.c, $(OBJ_DIR)/%.o, $(SRCS)} # installation directory INSTALL_DIR := $(VIVANTE_SDK_LIB) @@ -133,4 +249,15 @@ INSTALL_DIR := $(VIVANTE_SDK_LIB) ################################################################################ # Include the common makefile. 
+ifeq ($(USE_VIP_DEVICE),1)
+LDFLAGS += -pthread
+LIBS += -lstdc++
+INCLUDE += -I../include/vip
+$(OBJ_DIR)/virtual_device.o: virtual_device.cpp
+	@echo " COMPILE $(abspath $<)"
+	@mkdir -p $(OBJ_DIR)
+	@$(CXX) -c -std=c++14 -pthread $(CFLAGS) -o $@ $<
+endif
+
 include $(AQROOT)/common.target
+endif
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch2space.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch2space.c
index 24a3d14..fba4d05 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_batch2space.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch2space.c
@@ -34,7 +34,7 @@
 #include "utils/vsi_nn_util.h"
 #include "utils/vsi_nn_dtype_util.h"
 #include "utils/vsi_nn_math.h"
-#include "vsi_nn_test.h"
+#include "vsi_nn_error.h"
 #include "utils/vsi_nn_constraint_check.h"
 
 static vsi_status op_compute
@@ -48,9 +48,39 @@ static vsi_status op_compute
     vx_nn_reorg_params_ext_t param;
     vsi_nn_tensor_t *block_size_tensor = NULL;
     vsi_nn_tensor_t *pad_tensor = NULL;
+    vsi_nn_tensor_t *input_tensor = NULL;
+    vsi_nn_tensor_t *output_tensor = NULL;
     vsi_nn_tensor_attr_t attr;
 
-    memset(&param, 0, sizeof(vx_nn_reorg_params_ext_t));
+    int32_t block_size[2] = {1, 1};
+    vsi_bool need_release_tensor = TRUE;
+    block_size[0] = self->nn_param.batch2space.block_size[0];
+    if (vsi_nn_is_3d_tensor(inputs[0]))
+    {
+        vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{1}};
+        memcpy(shape[0], inputs[0]->attr.size, sizeof(shape[0]));
+        memcpy(shape[1], outputs[0]->attr.size, sizeof(shape[1]));
+        shape[0][3] = shape[0][2];
+        shape[0][2] = shape[0][1];
+        shape[0][1] = 1;
+        shape[1][3] = shape[1][2];
+        shape[1][2] = shape[1][1];
+        shape[1][1] = 1;
+
+        input_tensor = vsi_nn_reshape_tensor(self->graph, inputs[0], shape[0], 4);
+        CHECK_PTR_FAIL_GOTO( input_tensor, "create tensor fail.", final );
+        output_tensor = vsi_nn_reshape_tensor(self->graph, outputs[0], shape[1], 4);
+        CHECK_PTR_FAIL_GOTO( output_tensor, "create tensor fail.", final );
+    }
+    else
+    {
+        block_size[1] = self->nn_param.batch2space.block_size[1];
+        need_release_tensor = FALSE;
+        input_tensor = inputs[0];
+        output_tensor = outputs[0];
+    }
+
+    memset(&param, 0, sizeof(vx_nn_reorg_params_ext_t));
     memset(&attr, 0, sizeof(attr));
     attr.size[0] = 2;
     attr.dim_num = 1;
@@ -59,9 +89,9 @@ static vsi_status op_compute
     attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
     block_size_tensor = vsi_nn_CreateTensorFromData(
         self->graph,
-        (uint8_t *)self->nn_param.batch2space.block_size,
+        (uint8_t *)block_size,
         &attr);
-    TEST_CHECK_PTR(block_size_tensor, final);
+    CHECK_PTR_FAIL_GOTO( block_size_tensor, "create tensor fail.", final );
 
     memset(&attr, 0, sizeof(attr));
     attr.size[0] = 4;
@@ -73,16 +103,16 @@ static vsi_status op_compute
         self->graph,
         (uint8_t *)self->nn_param.batch2space.crop,
         &attr);
-    TEST_CHECK_PTR(pad_tensor, final);
+    CHECK_PTR_FAIL_GOTO( pad_tensor, "create tensor fail.", final );
 
     param.base.block_size = REQUIRED_IO(block_size_tensor);
     param.pad = OPTIONAL_IO(pad_tensor);
     param.base.type = VX_REORG_BATCH_TO_SPACE_ND;
     self->n = vxReorgLayer2( self->graph->g,
-        inputs[0]->t,
+        input_tensor->t,
         (vx_nn_reorg_params_t *)&param,
         sizeof(vx_nn_reorg_params_ext_t),
-        outputs[0]->t);
+        output_tensor->t);
 
     if( NULL != self->n )
     {
@@ -90,8 +120,13 @@ static vsi_status op_compute
     }
 
 final:
-    if (block_size_tensor) vsi_nn_ReleaseTensor(&block_size_tensor);
-    if (pad_tensor) vsi_nn_ReleaseTensor(&pad_tensor);
+    if (need_release_tensor)
+    {
+        vsi_safe_release_tensor(input_tensor);
+        vsi_safe_release_tensor(output_tensor);
+    }
+    vsi_safe_release_tensor(block_size_tensor);
+    
vsi_safe_release_tensor(pad_tensor); return status; } /* op_compute() */ @@ -105,14 +140,13 @@ static vsi_bool op_check { vsi_bool ret = FALSE; - if (inputs[0]->attr.dim_num != 4) + if (inputs[0]->attr.dim_num < 3) { - VSILOGE("batch2space only support 4D"); + VSILOGE("The input tensor shape must be 3D or 4D!"); return FALSE; } - if (self->nn_param.batch2space.block_size[0] < 0 - || self->nn_param.batch2space.block_size[1] < 0) + if (self->nn_param.batch2space.block_size[0] < 0) { VSILOGE("Block size can't be less than zero in batch to space"); return FALSE; @@ -131,18 +165,33 @@ static vsi_bool op_setup ) { vsi_nn_batch2space_param * p; + p = (vsi_nn_batch2space_param *)&(self->nn_param.batch2space); - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - outputs[0]->attr.size[3] = - inputs[0]->attr.size[3] / p->block_size[0] / p->block_size[1]; - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; - outputs[0]->attr.size[1] = - inputs[0]->attr.size[1] * p->block_size[1] - p->crop[2] - p->crop[3]; - outputs[0]->attr.size[0] = - inputs[0]->attr.size[0] * p->block_size[0] - p->crop[0] - p->crop[1]; - outputs[0]->attr.dim_num = 4; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + + if (vsi_nn_is_3d_tensor(inputs[0])) + { + outputs[0]->attr.size[2] = + inputs[0]->attr.size[2] / p->block_size[0]; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; + outputs[0]->attr.size[0] = + inputs[0]->attr.size[0] * p->block_size[0] - p->crop[0] - p->crop[1]; + } + else + { + outputs[0]->attr.size[3] = + inputs[0]->attr.size[3] / p->block_size[0] / p->block_size[1]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[1] = + inputs[0]->attr.size[1] * p->block_size[1] - p->crop[2] - p->crop[3]; + outputs[0]->attr.size[0] = + inputs[0]->attr.size[0] * p->block_size[0] - p->crop[0] - p->crop[1]; + } + + } return TRUE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c index 8f81613..da6af26 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c @@ -53,6 +53,7 @@ static vsi_bool setup_op_shapes vsi_size_t num_units = 0; vsi_size_t output_size = 0; vsi_size_t batch_size = 0; + vsi_bool use_virtual_tensor = TRUE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); if( curr_param->time_major ) @@ -94,6 +95,28 @@ static vsi_bool setup_op_shapes inputs[BI_RNN_BW_INPUT_H_STATE] = output_tensor->t; } + if( !outputs[BI_RNN_FW_OUTPUT_H_STATE] ) + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + memcpy( &attr.dtype, &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + outputs[BI_RNN_FW_OUTPUT_H_STATE] = output_tensor->t; + } + + if( !outputs[BI_RNN_BW_OUTPUT_H_STATE] ) + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + memcpy( &attr.dtype, &outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + outputs[BI_RNN_BW_OUTPUT_H_STATE] = output_tensor->t; + } + /* output */ if( VSI_NN_DIM_AUTO == outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dim_num ) { @@ -118,6 +141,26 
@@ static vsi_bool setup_op_shapes } } + /* output_state_out */ + if(VSI_NN_DIM_AUTO == outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.dim_num) + { + if( curr_param->merge_outputs ) + { + outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.size[0] = output_size*2; + outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.size[1] = batch_size; + outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.dim_num = 2; + } + else + { + outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.size[0] = output_size; + outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.size[1] = batch_size; + outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.dim_num = 2; + + outputs[BI_RNN_BW_OUTPUT_H_STATE]->attr.size[0] = output_size; + outputs[BI_RNN_BW_OUTPUT_H_STATE]->attr.size[1] = batch_size; + outputs[BI_RNN_BW_OUTPUT_H_STATE]->attr.dim_num = 2; + } + } return TRUE; } @@ -292,10 +335,36 @@ static vsi_bool op_setup /* rnncell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, - &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + &outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); rnncell_out1 = output_tensor->t; + if (reshape_output_tensors[time_step - 1 - i]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + inputs[BI_RNN_BW_INPUT_WEIGHT_I]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_I].qnt_type == VSI_NN_QNT_TYPE_NONE && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_I].vx_type == VSI_NN_TYPE_NONE) + { + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_I].vx_type = VSI_NN_TYPE_FLOAT32; + } + + if (last_step_h_state_fw && + last_step_h_state_fw->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + inputs[BI_RNN_BW_INPUT_WEIGHT_H]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_H].qnt_type == VSI_NN_QNT_TYPE_NONE && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_H].vx_type == VSI_NN_TYPE_NONE) + { + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_H].vx_type = VSI_NN_TYPE_FLOAT32; + } + + if (has_aux_input&& + aux_reshape_output_tensors[time_step - 1 - i]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + inputs[BI_RNN_BW_AUX_INPUT_WEIGHT]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX].qnt_type == VSI_NN_QNT_TYPE_NONE && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX].vx_type == VSI_NN_TYPE_NONE) + { + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX].vx_type = VSI_NN_TYPE_FLOAT32; + } + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RNNCELL_OVXLIB, 0, 0 ); curr->node->nn_param.rnncell_ovxlib.activation = curr_param->activation; memcpy( curr->node->nn_param.rnncell_ovxlib.internal_dtype, @@ -307,8 +376,8 @@ static vsi_bool op_setup curr->inputs[RNNCELL_INPUT_WEIGHT_I] = inputs[BI_RNN_FW_INPUT_WEIGHT_I]; curr->inputs[RNNCELL_INPUT_WEIGHT_H] = inputs[BI_RNN_FW_INPUT_WEIGHT_H]; - curr->inputs[RNNCELL_INPUT_BIAS] = inputs[BI_RNN_FW_INPUT_BIAS]; - + curr->inputs[RNNCELL_INPUT_BIAS_I] = inputs[BI_RNN_FW_INPUT_BIAS_I]; + curr->inputs[RNNCELL_INPUT_BIAS_H] = inputs[BI_RNN_FW_INPUT_BIAS_H]; if (has_aux_input) { curr->inputs[RNNCELL_INPUT_AUX_INPUT] = aux_reshape_output_tensors[i]; @@ -348,23 +417,49 @@ static vsi_bool op_setup /* rnncell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, - &outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + &outputs[BI_RNN_BW_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); rnncell_out1 = 
output_tensor->t; + if (reshape_output_tensors[time_step - 1 - i]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + inputs[BI_RNN_BW_INPUT_WEIGHT_I]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_I].qnt_type == VSI_NN_QNT_TYPE_NONE && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_I].vx_type == VSI_NN_TYPE_NONE) + { + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_I].vx_type = VSI_NN_TYPE_FLOAT32; + } + + if (last_step_h_state_bw && + last_step_h_state_bw->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + inputs[BI_RNN_BW_INPUT_WEIGHT_H]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_H].qnt_type == VSI_NN_QNT_TYPE_NONE && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_H].vx_type == VSI_NN_TYPE_NONE) + { + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_H].vx_type = VSI_NN_TYPE_FLOAT32; + } + + if (has_aux_input&& + aux_reshape_output_tensors[time_step - 1 - i]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + inputs[BI_RNN_BW_AUX_INPUT_WEIGHT]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX].qnt_type == VSI_NN_QNT_TYPE_NONE && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX].vx_type == VSI_NN_TYPE_NONE) + { + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX].vx_type = VSI_NN_TYPE_FLOAT32; + } + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RNNCELL_OVXLIB, 0, 0 ); curr->node->nn_param.rnncell_ovxlib.activation = curr_param->activation; memcpy( curr->node->nn_param.rnncell_ovxlib.internal_dtype, - &(curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_COUNT]), - sizeof(vsi_nn_dtype_t) * RNNCELL_QUANTIZE_PARAM_COUNT); + curr_param->internal_dtype, + sizeof(vsi_nn_dtype_t) * RNNCELL_QUANTIZE_PARAM_COUNT); curr->inputs[RNNCELL_INPUT_INPUT] = reshape_output_tensors[time_step - 1 - i]; curr->inputs[RNNCELL_INPUT_H_STATE] = last_step_h_state_bw; curr->inputs[RNNCELL_INPUT_WEIGHT_I] = inputs[BI_RNN_BW_INPUT_WEIGHT_I]; curr->inputs[RNNCELL_INPUT_WEIGHT_H] = inputs[BI_RNN_BW_INPUT_WEIGHT_H]; - curr->inputs[RNNCELL_INPUT_BIAS] = inputs[BI_RNN_BW_INPUT_BIAS]; - + curr->inputs[RNNCELL_INPUT_BIAS_I] = inputs[BI_RNN_BW_INPUT_BIAS_I]; + curr->inputs[RNNCELL_INPUT_BIAS_H] = inputs[BI_RNN_BW_INPUT_BIAS_H]; if(has_aux_input) { curr->inputs[RNNCELL_INPUT_AUX_INPUT] = aux_reshape_output_tensors[time_step - 1 - i]; @@ -454,6 +549,15 @@ static vsi_bool op_setup tensor = output_tensor->t; } + /* forward output state*/ + if (outputs[BI_RNN_FW_OUTPUT_H_STATE] != NULL) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = last_step_h_state_fw; + curr->outputs[0] = outputs[BI_RNN_FW_OUTPUT_H_STATE]; + vsi_nn_internal_setup_node(self, curr); + } + /* concat rnncell output, the rnn's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); curr->node->nn_param.concat.axis = 2; @@ -482,6 +586,15 @@ static vsi_bool op_setup tensor = output_tensor->t; } + /* backward output state*/ + if (outputs[BI_RNN_BW_OUTPUT_H_STATE] != NULL) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = last_step_h_state_bw; + curr->outputs[0] = outputs[BI_RNN_BW_OUTPUT_H_STATE]; + vsi_nn_internal_setup_node(self, curr); + } + /* concat rnncell output, the rnn's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); curr->node->nn_param.concat.axis = 2; diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c new file mode 100644 index 0000000..cac99d0 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c @@ -0,0 +1,208 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _bucketize_local_data_t { + int32_t placeholder; +} bucketize_local_data_t; + +/* + Declare number of input and output. 
+ */
+#define _INPUT_NUM (1)
+#define _OUTPUT_NUM (1)
+
+static vsi_status op_compute
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    int32_t right = self->nn_param.bucketize.right;
+    vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
+    vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+    vsi_size_t new_rank = 0;
+    vsi_bool ret = TRUE;
+    vsi_nn_kernel_param_t * param = NULL;
+
+    param = vsi_nn_kernel_param_create();
+
+    vsi_nn_kernel_param_add_int32( param, "right", right );
+
+    ret = vsi_nn_kernel_optimize_element_shape(
+            inputs[0]->attr.size, inputs[0]->attr.dim_num,
+            shape, &new_rank );
+
+    if ( ret )
+    {
+        reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph,
+                inputs[0], shape, new_rank );
+        reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph,
+                outputs[0], shape, new_rank );
+        shape[0] = inputs[1]->attr.size[0];
+        shape[1] = 1;
+        reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
+                inputs[1], shape, 2 );
+
+        self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
+                "bucketize",
+                &reshape_tensors[0], 2,
+                &reshape_tensors[2], 1, param );
+
+        vsi_safe_release_tensor( reshape_tensors[0] );
+        vsi_safe_release_tensor( reshape_tensors[1] );
+        vsi_safe_release_tensor( reshape_tensors[2] );
+    }
+
+    if ( self->n )
+    {
+        status = VSI_SUCCESS;
+    }
+
+    vsi_nn_kernel_param_release( &param );
+
+    return status;
+} /* op_compute() */
+
+static vsi_bool op_check
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    BEGIN_IO_TYPE_DECL(BUCKETIZE, 2, 1)
+        IO_TYPE(D_U32, D_U32, D_I32)
+        IO_TYPE(D_I32, D_I32, D_I32)
+        IO_TYPE(D_F32, D_F32, D_I32)
+        IO_TYPE(D_F16, D_F16, D_I32)
+        IO_TYPE(D_BF16, D_BF16, D_I32)
+        IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I32)
+        IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I32)
+        IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32)
+        IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32)
+        IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32)
+        IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32)
+        IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32)
+    END_IO_TYPE_DECL(BUCKETIZE)
+    if (!VALIDATE_OP_IO_TYPES(BUCKETIZE, self, inputs, self->input.num, outputs, self->output.num))
+    {
+        char* desc = generate_op_io_types_desc(inputs,
+                self->input.num, outputs, self->output.num);
+        VSILOGE("Inputs/Outputs data type not support: %s", desc);
+        destroy_op_io_types_desc(desc);
+        return FALSE;
+    }
+
+    return TRUE;
+} /* op_check() */
+
+static vsi_bool op_setup
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    uint32_t i, out_rank;
+    vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+    vsi_bool ret = TRUE;
+
+    out_rank = inputs[0]->attr.dim_num;
+
+    for (i = 0; i < out_rank; i++)
+    {
+        shape[i] = inputs[0]->attr.size[i];
+    }
+    if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+    {
+        outputs[0]->attr.dim_num = out_rank;
+        memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) );
+    }
+    else
+    {
+        vsi_size_t total_size_got;
+        vsi_size_t total_size_expected;
+        total_size_expected = vsi_nn_ShapeProduct( shape, out_rank );
+        total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size,
+                outputs[0]->attr.dim_num );
+        if ( total_size_expected != total_size_got )
+        {
+            VSILOGW("Output size mismatch, expect %"VSI_SIZE_T_SPECIFIER", but got %"VSI_SIZE_T_SPECIFIER"",
+                    total_size_expected, total_size_got);
+            ret = FALSE;
+        }
+    }
+
+    return ret;
+} /* op_setup() */
+
+static vsi_status op_init
+    (
+    vsi_nn_node_t* self
+    )
+{
+    self->nn_param.bucketize.right = FALSE;
+
+    return VSI_SUCCESS;
+} /* 
op_init() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ BUCKETIZE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c index b2b01f5..4f55660 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c @@ -35,11 +35,11 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "vsi_nn_internal_node.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #include "utils/vsi_nn_constraint_check.h" +#include "utils/vsi_nn_dtype_util.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) @@ -65,14 +65,24 @@ static vsi_status op_compute vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_size_t new_rank = 0; - vsi_bool ret; + vsi_bool ret = TRUE; vsi_nn_kernel_param_t * param = NULL; param = vsi_nn_kernel_param_create(); - ret = vsi_nn_kernel_optimize_element_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, - shape, &new_rank ); + if ( vsi_nn_TypeGetBits(inputs[0]->attr.dtype.vx_type) == 4 || + vsi_nn_TypeGetBits(outputs[0]->attr.dtype.vx_type) == 4 ) + { + new_rank = inputs[0]->attr.dim_num; + memcpy(shape, inputs[0]->attr.size, sizeof(inputs[0]->attr.size)); + } + else + { + ret = vsi_nn_kernel_optimize_element_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + shape, &new_rank ); + } + vsi_nn_kernel_param_add_float32( param, "min_value", min_value ); vsi_nn_kernel_param_add_float32( param, "max_value", max_value ); @@ -154,8 +164,11 @@ static vsi_bool op_check /* HW 9.1.1 */ IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I4|Q_SYM) IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U4|Q_ASYM) IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) END_IO_TYPE_DECL(CLIP) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c index 5ebe3cf..3b2cf21 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c @@ -119,6 +119,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "rounding_policy", self->vx_param.rounding_policy ); vsi_nn_kernel_param_add_int32( param, "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); + vsi_nn_kernel_param_add_int32( param, "pad_mode", vsi_nn_get_vx_pad_mode( self->nn_param.conv1d.pad_mode ) ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "conv1d_ovxlib", new_inputs, 3, outputs, 1, param ); @@ -136,6 +137,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "rounding_policy", self->vx_param.rounding_policy ); vsi_nn_kernel_param_add_int32( param, "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); + vsi_nn_kernel_param_add_int32( param, "pad_mode", vsi_nn_get_vx_pad_mode( self->nn_param.conv1d.pad_mode ) ); if( self->nn_param.conv1d.multiplier > 0 ) { vsi_nn_kernel_param_add_int32( param, "multiplier", diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c index ba50ffd..228b586 100644 --- 
a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c @@ -57,6 +57,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "rounding_policy", self->vx_param.rounding_policy ); vsi_nn_kernel_param_add_int32( param, "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); + vsi_nn_kernel_param_add_int32( param, "pad_mode", vsi_nn_get_vx_pad_mode( self->nn_param.conv2d.pad_mode ) ); if (self->nn_param.conv2d.multiplier != 0) { vsi_nn_kernel_param_add_int32( param, "multiplier", self->nn_param.conv2d.multiplier ); @@ -87,318 +88,95 @@ static vsi_bool op_check /* Check fl and scale*/ ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); - if(ret) { + if (ret) { + vsi_size_t kx = 1; + vsi_size_t ky = 1; /* check inputs outputs data type */ - BEGIN_IO_TYPE_DECL(CONV2D, 3, 1) - /* IO_TYPE(INPUT, WEIGHT, BIAS, OUTPUT) */ - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_F16) + BEGIN_IO_TYPE_DECL(CONV2D, 2, 0) + /* IO_TYPE(INPUT, WEIGHT) */ + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_F16) - - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_ASYM, D_U8|Q_ASYM) - - IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) - IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) - - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I8|Q_SYM) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - - /* IO_TYPE(INPUT, WEIGHT, NULL, OUTPUT) */ - IO_TYPE(D_F32, D_F32, D_NONE, D_F32) - - IO_TYPE(D_F16, D_F16, D_NONE, D_F16) - - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F16) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) - - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F16) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) - - IO_TYPE(D_BF16, D_BF16, D_NONE, D_F32) - IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) - - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_I8|Q_SYM) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC) /* HW 9.0 */ - IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) - IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) + IO_TYPE(D_F32, D_BF16) /* HW 9.0.1 */ - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, 
D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, 
D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) - - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_F32) - - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_F16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_NONE, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_F16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_F32) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_F32) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_F32) - - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_I8|Q_DFP, 
D_U8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F32) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_F16, D_F16, D_NONE, D_BF16) - IO_TYPE(D_F16, D_F16, D_NONE, D_F32) - IO_TYPE(D_F16, D_F16, D_F32, D_BF16) - 
IO_TYPE(D_F16, D_F16, D_F32, D_F32) - - IO_TYPE(D_BF16, D_BF16, D_NONE, D_F16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_F16) - - IO_TYPE(D_F32, D_BF16, D_NONE, D_F16) - IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) - IO_TYPE(D_F32, D_BF16, D_NONE, D_F32) - IO_TYPE(D_F32, D_BF16, D_F32, D_F16) - IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) - IO_TYPE(D_F32, D_BF16, D_F32, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM) /* HW 9.1.1 */ - IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U4|Q_ASYM) - IO_TYPE(D_I4|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I4|Q_DFP) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I4|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_I4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I4|Q_SYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM, D_I4|Q_SYM) - IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I4|Q_SYM) - IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U4|Q_ASYM) - IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_SYM) - IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I4|Q_SYM) - IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC) + + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I8|Q_ASYM) + + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_U8|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_U8|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM) + + IO_TYPE(D_I16|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I16|Q_ASYM, D_U8|Q_SYM_PC) + IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM_PC) + 
IO_TYPE(D_I16|Q_SYM, D_U8|Q_SYM_PC) END_IO_TYPE_DECL(CONV2D) - ret = VALIDATE_OP_IO_TYPES(CONV2D, self, inputs, self->input.num, outputs, self->output.num); - if(!ret) { + ret = VALIDATE_OP_IO_TYPES(CONV2D, self, inputs, 2, outputs, 0); + if (!ret) { char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); + 2, outputs, 0); VSILOGE("Inputs/Outputs data type not support: %s", desc); destroy_op_io_types_desc(desc); return FALSE; } /* check parameters */ - if(inputs[1]->attr.size[0] * inputs[1]->attr.size[1] > 6400) { + kx = inputs[1]->attr.size[0]; + ky = inputs[1]->attr.dim_num == 3 ? 1 : inputs[1]->attr.size[1]; + if (kx * ky > 6400) { VSILOGE("Kernel size should <= 6400."); return FALSE; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c index 9ab2266..5af07e2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c @@ -106,6 +106,7 @@ static vsi_nn_internal_tensor_t * create_input_conv input_conv->node->vx_param.overflow_policy = self->vx_param.overflow_policy; input_conv->node->vx_param.rounding_policy = self->vx_param.rounding_policy; input_conv->node->vx_param.down_scale_size_rounding = self->vx_param.down_scale_size_rounding; + input_conv->node->nn_param.conv2d.pad_mode = p->conv2d.pad_mode; input_conv->inputs[0] = input; input_conv->inputs[1] = weight; @@ -167,6 +168,7 @@ static vsi_nn_internal_tensor_t * create_recurrent_conv recurrent_conv->node->vx_param.overflow_policy = self->vx_param.overflow_policy; recurrent_conv->node->vx_param.rounding_policy = self->vx_param.rounding_policy; recurrent_conv->node->vx_param.down_scale_size_rounding = self->vx_param.down_scale_size_rounding; + recurrent_conv->node->nn_param.conv2d.pad_mode = p->conv2d.pad_mode; recurrent_conv->inputs[0] = input; recurrent_conv->inputs[1] = weight; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c index 063dbd0..2b470b1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c @@ -77,6 +77,7 @@ static vsi_status op_compute MAP_PARAM("overflow_policy",self->vx_param.overflow_policy); MAP_PARAM("rounding_policy",self->vx_param.rounding_policy); MAP_PARAM("down_scale_size_rounding",self->vx_param.down_scale_size_rounding); + MAP_PARAM("pad_mode", vsi_nn_get_vx_pad_mode( self->nn_param.conv3d.pad_mode ) ); if ( self->nn_param.conv3d.dilation[0] * self->nn_param.conv3d.dilation[1] * diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index e18c4bd..8d8ff5f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -92,6 +92,11 @@ static vsi_status op_optimize status = VSI_SUCCESS; + if( !self->graph->ctx->options.enable_dataconvert_optimize ) + { + return status; + } + if ( _is_same_quant(self, inputs, outputs) == FALSE || (inputs[0]->t != NULL && outputs[0]->t != NULL)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c index 0692666..4128480 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c @@ -35,6 +35,7 @@ #include "vsi_nn_log.h" #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" +#include 
"vsi_nn_error.h" #define COMPUTE_DECONV_SZ( in, ksize, pad_1, pad_2, stride, output_padding )\ (( in - 1 ) * stride + ksize - pad_1 - pad_2 + output_padding) @@ -66,20 +67,18 @@ static vsi_status op_compute if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) { weight_tensor = vsi_nn_reshape_tensor( self->graph, inputs[1], weight_attr.size, 4 ); + CHECK_PTR_FAIL_GOTO( weight_tensor, "create tensor fail.", final ); } else { uint8_t * data = NULL; data = vsi_nn_ConvertTensorToData( self->graph, inputs[1] ); - if (NULL == data) - { - VSILOGE("Convert data fail.\n"); - status = VSI_FAILURE; - return status; - } + CHECK_PTR_FAIL_GOTO( data, "Convert data fail.", final ); + weight_attr.dtype.channel_dim = inputs[1]->attr.dtype.channel_dim + 1; weight_tensor = vsi_nn_CreateTensorFromData(self->graph, data, &weight_attr); vsi_nn_safe_free( data ); + CHECK_PTR_FAIL_GOTO( weight_tensor, "create tensor fail.", final ); } #ifdef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c index 2a9f688..498c869 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c @@ -68,6 +68,8 @@ static vsi_status op_compute self->vx_param.rounding_policy ); vsi_nn_kernel_param_add_int32( param, "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); + vsi_nn_kernel_param_add_int32( param, "pad_mode", + vsi_nn_get_vx_pad_mode( self->nn_param.depthwise_conv1d.pad_mode ) ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "depthwise_conv1d", inputs, 3, outputs, 1, param ); if( self->n ) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c index 73ba406..aea2f63 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c @@ -38,6 +38,14 @@ #include "kernel/vsi_nn_kernel_eltwise.h" #include "utils/vsi_nn_constraint_check.h" +vsi_bool vsi_nn_kernel_is_supported_types + ( + vsi_nn_tensor_t** inputs, + size_t input_num, + vsi_nn_tensor_t** outputs, + size_t output_num + ); + static vsi_status _eltwise_op_compute ( const char * kernel_name, @@ -54,8 +62,9 @@ static vsi_status _eltwise_op_compute vx_bool doShapeOptimized = TRUE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_context_t ctx = NULL; + vsi_bool is_executed_on_sh = FALSE; - if( NULL == self ) + if ( NULL == self ) { return VSI_FAILURE; } @@ -63,11 +72,15 @@ static vsi_status _eltwise_op_compute ctx = self->graph->ctx; + is_executed_on_sh = vsi_nn_kernel_is_supported_types(inputs, 2, outputs, 1) && + !ctx->config.support_stream_processor; + if ( strcmp(kernel_name, "sub") == 0 || strcmp(kernel_name, "add") == 0 || strcmp(kernel_name, "mul") == 0 - || (strcmp(kernel_name, "maximum") == 0 && ctx->config.support_stream_processor) - || (strcmp(kernel_name, "minimum") == 0 && ctx->config.support_stream_processor)) + || (strcmp(kernel_name, "maximum") == 0 && !is_executed_on_sh) + || (strcmp(kernel_name, "minimum") == 0 && !is_executed_on_sh) + || (strcmp(kernel_name, "div") == 0 && !is_executed_on_sh)) { doShapeOptimized = FALSE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c index 6bb4dad..1a2a3aa 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c @@ 
-118,7 +118,11 @@ static vsi_bool op_setup memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); attr.dim_num = p->dim_num; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) { + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + } else { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; attr.is_const = TRUE; for(i = 0; i < p->dim_num; i++) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c index 26d3380..3522896 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c @@ -149,6 +149,7 @@ static vsi_bool op_setup curr->node->nn_param.grouped_conv2d.multiplier = p->multiplier; curr->node->nn_param.grouped_conv2d.weights = p->weights; curr->node->nn_param.grouped_conv2d.pad_type = p->pad_type; + curr->node->nn_param.grouped_conv2d.pad_mode = p->pad_mode; vsi_nn_internal_setup_node(self, curr); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c index 4f2ae60..f3818c7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c @@ -172,6 +172,7 @@ static vsi_status op_compute p_ext->padding_x_right = self->nn_param.conv2d.pad[1]; p_ext->padding_y_bottom = self->nn_param.conv2d.pad[3]; + p_ext->pad_mode = vsi_nn_get_vx_pad_mode(nn_param->pad_mode); //set ext2 relative parameters p_ext2->depth_multiplier = self->nn_param.conv2d.multiplier; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c index 0f7baf9..e872a3d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c @@ -181,10 +181,7 @@ static vsi_status op_compute p = &(self->nn_param.l2normalizescale); axis = p->axis; - if ( (inputs[1]->attr.is_const == TRUE && _check_value_is_equal_to_one(self->graph, inputs[1])) || - ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 && - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) - ) + if ( self->nn_param.l2normalizescale.local.use_internal_node ) { return vsi_nn_internal_compute_node( self ); } @@ -350,14 +347,16 @@ static vsi_bool op_setup if ( inputs[1]->attr.is_const == TRUE && _check_value_is_equal_to_one( self->graph, inputs[1] ) ) { + self->nn_param.l2normalizescale.local.use_internal_node = TRUE; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_L2_NORMALIZE, 0, 0); curr->node->nn_param.l2_normalize.axis = self->nn_param.l2normalizescale.axis; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; vsi_nn_internal_setup_node( self, curr ); } - else if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 && - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) + else if ( ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 && + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) || + self->graph->ctx->config.support_stream_processor ) { vsi_nn_internal_tensor_t* output_tensor = NULL; vsi_nn_internal_tensor_t* reshape_tensor = NULL; @@ -365,6 +364,8 @@ static vsi_bool op_setup int32_t dim_num = inputs[0]->attr.dim_num; int32_t i = 0; + self->nn_param.l2normalizescale.local.use_internal_node = TRUE; + memcpy( &attr, &outputs[0]->attr, sizeof( attr ) ); attr.vtl = TRUE; attr.is_const = FALSE; @@ -382,7 +383,7 @@ static vsi_bool op_setup attr.size[i] = i 
== self->nn_param.l2normalizescale.axis ? inputs[0]->attr.size[i] : 1; } attr.dim_num = dim_num; - if (attr.dtype.vx_type != VSI_NN_TYPE_BFLOAT16) + if (attr.dtype.vx_type != VSI_NN_TYPE_BFLOAT16 && inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16) { attr.dtype.vx_type = VSI_NN_TYPE_BFLOAT16; attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; @@ -415,6 +416,8 @@ static vsi_status op_init vsi_status status = VSI_SUCCESS; uint32_t i = 0; + self->nn_param.l2normalizescale.local.use_internal_node = FALSE; + if (vsi_nn_compareVersion(self->graph, 1, 1, 13) == -1) { self->nn_param.l2normalizescale.axis = VSI_NN_L2NORMALIZESCALE_DEFAULT_AXIS; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index 74623e2..f8330b7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -39,7 +39,6 @@ #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) -#define VSI_NN_SUPPORT_AXIS (0) static vsi_status op_compute ( @@ -52,16 +51,17 @@ static vsi_status op_compute vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; float eps = self->nn_param.layernorm.eps; -#if VSI_NN_SUPPORT_AXIS - if ( 0 ) + int32_t axis = self->nn_param.layernorm.axis; + + if ( self->nn_param.layernorm.local->use_internal_node ) { return vsi_nn_internal_compute_node( self ); } -#endif param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_float32( param, "eps", eps ); + vsi_nn_kernel_param_add_int32( param, "axis", axis ); n = vsi_nn_kernel_selector( self->graph, "layer_norm", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); if ( n != NULL ) @@ -86,39 +86,43 @@ static vsi_bool op_setup ) { vsi_bool ret = TRUE; -#if VSI_NN_SUPPORT_AXIS + int32_t axis = 0; vsi_nn_internal_node_t* curr = NULL; -#endif if ( NULL == self ) { return FALSE; } -#if VSI_NN_SUPPORT_AXIS + + axis = self->nn_param.layernorm.axis; + vsi_nn_internal_init_node_wksp( self ); - if ( 0 ) + if ( axis != 0 && !self->graph->ctx->config.support_stream_processor) { vsi_nn_internal_tensor_t* mean_tensor = NULL; vsi_nn_internal_tensor_t* vari_tensor = NULL; vsi_nn_tensor_attr_t attr; - int32_t *axis = NULL; + int32_t *axis_array = NULL; + + self->nn_param.layernorm.local->use_internal_node = TRUE; memcpy( &attr, &inputs[0]->attr, sizeof( attr ) ); - attr.size[0] = 1; + attr.size[axis] = 1; attr.vtl = TRUE; attr.is_const = FALSE; attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + mean_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); vari_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_MOMENTS, 0, 0); - axis = (int32_t*)\ - vsi_nn_internal_new_node_param(curr, sizeof(int32_t) * 4); - axis[0] = 0; + axis_array = (int32_t*)\ + vsi_nn_internal_new_node_param(curr, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + axis_array[0] = axis; - curr->node->nn_param.moments.axis = axis; + curr->node->nn_param.moments.axis = axis_array; curr->node->nn_param.moments.axis_num = 1; curr->inputs[0] = inputs[0]; curr->outputs[0] = mean_tensor->t; @@ -136,7 +140,6 @@ static vsi_bool op_setup vsi_nn_internal_setup_node( self, curr ); } else -#endif { ret = vsi_nn_op_common_setup(self, inputs, outputs); } @@ -211,14 +214,31 @@ static vsi_bool op_check return TRUE; } /* op_check() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + 
self->nn_param.layernorm.axis = 0; + + self->nn_param.layernorm.local = (vsi_nn_layernorm_lcl_data *)malloc(sizeof(vsi_nn_layernorm_lcl_data)); + memset(self->nn_param.layernorm.local, 0x00, sizeof(vsi_nn_layernorm_lcl_data)); + self->nn_param.layernorm.local->use_internal_node = FALSE; + + return status; +} + static vsi_status op_deinit ( vsi_nn_node_t * self ) { -#if VSI_NN_SUPPORT_AXIS + vsi_nn_safe_free(self->nn_param.layernorm.local); + vsi_nn_internal_deinit_node_wksp( self ); -#endif + vsi_nn_op_common_deinit(self); return VSI_SUCCESS; @@ -231,7 +251,7 @@ extern "C" { DEF_OP_REG ( /* op_name */ LAYER_NORM, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lppool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lppool.c new file mode 100644 index 0000000..1758ac1 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lppool.c @@ -0,0 +1,259 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _lppool_local_data_t { + int32_t placeholder; +} lppool_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param = NULL; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + uint32_t i = 0; + uint32_t new_rank = 0; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + int32_t ksize_x = (int32_t)self->nn_param.lppool.ksize[0]; + int32_t ksize_y = (int32_t)self->nn_param.lppool.ksize[1]; + int32_t p = (int32_t)self->nn_param.lppool.p; + int32_t pad_left = (int32_t)self->nn_param.lppool.pad[0]; + int32_t pad_right = (int32_t)self->nn_param.lppool.pad[1]; + int32_t pad_top = (int32_t)self->nn_param.lppool.pad[2]; + int32_t pad_bottom = (int32_t)self->nn_param.lppool.pad[3]; + int32_t stride_x = (int32_t)self->nn_param.lppool.stride[0]; + int32_t stride_y = (int32_t)self->nn_param.lppool.stride[1]; + new_rank = 3; + + shapes[0][0] = inputs[0]->attr.size[0]; + shapes[0][1] = inputs[0]->attr.size[1]; + shapes[0][2] = inputs[0]->attr.size[2]; + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[1][1] = outputs[0]->attr.size[1]; + shapes[1][2] = outputs[0]->attr.size[2]; + + for (i = 3; i < inputs[0]->attr.dim_num; i++) + { + shapes[0][2] = shapes[0][2] * inputs[0]->attr.size[i]; + shapes[1][2] = shapes[1][2] * outputs[0]->attr.size[i]; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shapes[1], new_rank ); + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32(param, "ksize_x", ksize_x); + vsi_nn_kernel_param_add_int32(param, "ksize_y", ksize_y); + vsi_nn_kernel_param_add_int32(param, "pad_left", pad_left); + vsi_nn_kernel_param_add_int32(param, "pad_right", pad_right); + vsi_nn_kernel_param_add_int32(param, "pad_top", pad_top); + vsi_nn_kernel_param_add_int32(param, "pad_bottom", pad_bottom); + vsi_nn_kernel_param_add_int32(param, "stride_x", stride_x); + vsi_nn_kernel_param_add_int32(param, "stride_y", stride_y); + vsi_nn_kernel_param_add_int32(param, "p", p); + + self->n = (vx_node)vsi_nn_kernel_selector(self->graph,"lppool", + &reshape_tensors[0],_INPUT_NUM,&reshape_tensors[1],_OUTPUT_NUM,param); + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release(¶m); + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(LPPOOL, 1, 1) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F32, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_SYM, D_F32) + IO_TYPE(D_I16|Q_DFP, D_F32) + IO_TYPE(D_I8|Q_DFP, D_F32) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + END_IO_TYPE_DECL(LPPOOL) + + if (!VALIDATE_OP_IO_TYPES( + LPPOOL, self, inputs, self->input.num, outputs, self->output.num)) 
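/* reject input/output dtype pairs that are not listed in the LPPOOL IO table above and log the offending combination */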
+ { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_size_t ksize[_cnt_of_array(self->nn_param.lppool.ksize)] = {0}; + vsi_size_t i = 0; + vsi_size_t pad[_cnt_of_array(self->nn_param.lppool.pad)] = {0}; + + for (i = 0; i < _cnt_of_array(self->nn_param.lppool.ksize); i++) + { + ksize[i] = self->nn_param.lppool.ksize[i]; + } + for (i = 0; i < _cnt_of_array(self->nn_param.lppool.pad); i++) + { + pad[i] = self->nn_param.lppool.pad[i]; + } + + vsi_nn_compute_padding( + inputs[0]->attr.size, + ksize, + self->nn_param.lppool.stride, + NULL, + self->nn_param.lppool.pad_type, + pad + ); + for (i = 0; i < _cnt_of_array(self->nn_param.lppool.ksize); i++) + { + self->nn_param.lppool.ksize[i] = (uint32_t)ksize[i]; + } + for (i = 0; i < _cnt_of_array(self->nn_param.lppool.pad); i++) + { + self->nn_param.lppool.pad[i] = (uint32_t)pad[i]; + } + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + self->nn_param.lppool.ksize[0], + &self->nn_param.lppool.pad[0], + self->nn_param.lppool.stride[0], + 0, + VSI_NN_ROUND_CEIL + ); + outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[1], + self->nn_param.lppool.ksize[1], + &self->nn_param.lppool.pad[1], + self->nn_param.lppool.stride[1], + 0, + VSI_NN_ROUND_CEIL + ); + for (i = 2; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + self->nn_param.lppool.p = 2; + + return status; +} /* op_init() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ LPPOOL, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c index a6c5c63..bcdc2d9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_util.h" @@ -123,10 +123,11 @@ static vsi_bool op_setup int32_t ifco_start_index = 0; vsi_nn_tensor_attr_t attr; int32_t i = 0; + vsi_bool ret = FALSE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - if( NULL == self ) + if ( NULL == self ) { return FALSE; } @@ -160,13 +161,15 @@ static vsi_bool op_setup attr.size[1] = 1; attr.dim_num = 2; t0 = vsi_nn_reshape_tensor(self->graph, inputs[LSTMUNIT_ACT_DATA_BI + i], attr.size, attr.dim_num); + CHECK_PTR_FAIL_GOTO( t0, "create tensor fail.", final ); - if( dst_dtype.vx_type != t0->attr.dtype.vx_type + if ( dst_dtype.vx_type != t0->attr.dtype.vx_type && dst_dtype.qnt_type != t0->attr.dtype.qnt_type ) { 
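/* both the element type and the quantization of the reshaped bias differ from the activation's working dtype, so keep a converted copy and release the temporary reshape handle */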
p->local.tensors[LSTMUNIT_ACT_TENSOR_BI + i] = vsi_nn_ConvertTensorDtype( self->graph, t0, &dst_dtype ); - vsi_nn_ReleaseTensor( &t0 ); + + vsi_safe_release_tensor(t0); } else { @@ -182,13 +185,14 @@ static vsi_bool op_setup attr.size[1] = 1; attr.dim_num = 2; t1 = vsi_nn_reshape_tensor(self->graph, inputs[LSTMUNIT_ACT_LN_WI + i], attr.size, attr.dim_num); + CHECK_PTR_FAIL_GOTO( t1, "create tensor fail.", final ); - if( dst_dtype.vx_type != t1->attr.dtype.vx_type + if ( dst_dtype.vx_type != t1->attr.dtype.vx_type && dst_dtype.qnt_type != t1->attr.dtype.qnt_type ) { p->local.tensors[LSTMUNIT_ACT_TENSOR_LN_WI + i] = vsi_nn_ConvertTensorDtype( self->graph, t1, &dst_dtype ); - vsi_nn_ReleaseTensor( &t1 ); + vsi_safe_release_tensor(t1); } else { @@ -226,7 +230,9 @@ static vsi_bool op_setup outputs[LSTMUNIT_ACT_HSTATE_OUT]->attr.size[3] = outputs[LSTMUNIT_ACT_OUTPUT]->attr.size[3]; } - return TRUE; + ret = TRUE; +final: + return ret; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c index 23b987d..9df9c1b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c @@ -184,15 +184,13 @@ static vsi_bool op_setup if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { ret = vsi_nn_OpSetup( VSI_NN_OP_POOL, self, inputs, outputs ); - } if ( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) { - outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; - memcpy( outputs[1]->attr.size, outputs[0]->attr.size, - VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); + outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; + memcpy( outputs[1]->attr.size, outputs[0]->attr.size, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); } - return ret; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c index c1d35eb..2c7dba9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c @@ -138,6 +138,7 @@ static vsi_status op_compute vsi_status status; vx_nn_pad_params_t p; vsi_nn_tensor_t *convert_tensor = NULL; + vsi_bool release_intermediate_tensor = TRUE; status = VSI_FAILURE; if (VSI_SUCCESS != vsi_nn_InitPadParameter(self, &p)) @@ -164,8 +165,8 @@ static vsi_status op_compute } else { - convert_tensor = vsi_nn_reshape_tensor( self->graph, - inputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num ); + convert_tensor = inputs[0]; + release_intermediate_tensor = FALSE; } self->n = vxTensorPadNode( self->graph->g, @@ -182,7 +183,10 @@ static vsi_status op_compute final: vsi_nn_DeinitPadParameter(&p); - vsi_safe_release_tensor(convert_tensor); + if (release_intermediate_tensor) + { + vsi_safe_release_tensor(convert_tensor); + } return status; } /* op_compute() */ @@ -266,7 +270,7 @@ static vsi_bool op_setup if (front + back + inputs[0]->attr.size[i] != outputs[0]->attr.size[i]) { VSILOGE("Error:output shape[%u] not equal front padding[%u] + input shape[%u] + back padding[%u]", - outputs[0]->attr.size[i], front, back); + outputs[0]->attr.size[i], front, inputs[0]->attr.size[i], back); return FALSE; } } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c index bd01a72..8f6227e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c @@ -45,31 +45,6 @@ typedef struct _pad2_local_data_t { #define _INPUT_NUM (1) #define _OUTPUT_NUM 
(1) -static int32_t _get_vx_pad_mode(vx_enum mode) -{ - int32_t pad_mode = 0; - switch (mode) - { - case VSI_NN_PAD_MODE_CONSTANT: - pad_mode = VX_PAD_CONSTANT; - break; - case VSI_NN_PAD_MODE_REPLICATE: - pad_mode = VX_PAD_REPLICATE; - break; - case VSI_NN_PAD_MODE_SYMMETRIC: - pad_mode = VX_PAD_MIRROR_SYMMETRIC; - break; - case VSI_NN_PAD_MODE_REFLECT: - pad_mode = VX_PAD_MIRROR_REFLECT; - break; - default: - VSILOGE("Wrong pad_mode value"); - break; - } - - return pad_mode; -} - static int32_t _check_mirror_pad_size ( vx_enum mode, @@ -122,7 +97,7 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_pad2_param *p = &self->nn_param.pad2; vsi_nn_kernel_param_t * param; - int32_t pad_mode = _get_vx_pad_mode(p->mode); + int32_t pad_mode = vsi_nn_get_vx_pad_mode(p->mode); param = vsi_nn_kernel_param_create(); @@ -230,7 +205,7 @@ static vsi_bool op_setup if (front + back + inputs[0]->attr.size[i] != outputs[0]->attr.size[i]) { VSILOGE("Error:output shape[%u] not equal front padding[%u] + input shape[%u] + back padding[%u]", - outputs[0]->attr.size[i], front, back); + outputs[0]->attr.size[i], front, inputs[0]->attr.size[i], back); return FALSE; } } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c index eadb94a..38409d6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c @@ -141,7 +141,7 @@ static vsi_status op_optimize char tensor_name[128]; dim = inputs[0]->attr.dim_num; - if(FALSE == _is_pool1d(self, inputs)) + if (FALSE == _is_pool1d(self, inputs)) { return VSI_SUCCESS; } @@ -155,9 +155,9 @@ static vsi_status op_optimize { /* reshape 3d input (xcn) --> 4d input (whcn) */ shape[0] = inputs[0]->attr.size[0];//width - shape[1] = 1;//height - shape[2] = inputs[0]->attr.size[1]; - shape[3] = inputs[0]->attr.size[2]; + shape[1] = inputs[0]->attr.size[1]; + shape[2] = inputs[0]->attr.size[2]; + shape[3] = 1;//batch dim = 4; local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim); } @@ -165,9 +165,9 @@ static vsi_status op_optimize { /* reshape 3d output(xcn) --> 4d output(whcn) */ shape[0] = outputs[0]->attr.size[0];//width - shape[1] = 1;//height - shape[2] = outputs[0]->attr.size[1]; - shape[3] = outputs[0]->attr.size[2]; + shape[1] = outputs[0]->attr.size[1]; + shape[2] = outputs[0]->attr.size[2]; + shape[3] = 1;//batch dim = 4; local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim); if (local->reshaped_output && local->reshaped_output->t) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index f913afd..c7f47af 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -94,8 +94,11 @@ static vsi_bool op_setup p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR || - p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422 ) { uint32_t i = 0; @@ -160,7 +163,14 @@ static vsi_bool op_setup curr->node->nn_param.pre_process_gray.output_attr.dim_num = p->output_attr.dim_num; curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; - curr->outputs[0] = 
outputs[PRE_PROCESS_OUTPUT]; + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + curr->outputs[0] = preprocess_tensor->t; + } + else + { + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + } vsi_nn_internal_setup_node(self, curr); } @@ -470,6 +480,57 @@ static vsi_bool op_setup vsi_nn_internal_setup_node(self, curr); } break; + case VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422: + case VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422: + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV422, 0, 0 ); + + if (p->reverse_channel) + { + curr->node->nn_param.pre_process_yuv422.r_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_yuv422.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_yuv422.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_yuv422.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_yuv422.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_yuv422.b_mean = p->norm.mean[2]; + } + + if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422) + { + curr->node->nn_param.pre_process_yuv422.yuv422_type = 0; + } + else + { + curr->node->nn_param.pre_process_yuv422.yuv422_type = 1; + } + + curr->node->nn_param.pre_process_yuv422.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv422.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_yuv422.rect.left = p->rect.left; + curr->node->nn_param.pre_process_yuv422.rect.top = p->rect.top; + curr->node->nn_param.pre_process_yuv422.rect.width = p->rect.width; + curr->node->nn_param.pre_process_yuv422.rect.height = p->rect.height; + curr->node->nn_param.pre_process_yuv422.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_yuv422.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_yuv422.perm = p->perm; + curr->node->nn_param.pre_process_yuv422.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + curr->outputs[0] = preprocess_tensor->t; + } + else + { + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + } + + vsi_nn_internal_setup_node(self, curr); + } + break; default: { VSILOGE( "Not support this type!(PRE_PROCESS)\n"); @@ -479,10 +540,13 @@ static vsi_bool op_setup } if ( p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422 || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444 || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP ) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c index 176aabf..a60f446 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c @@ -87,12 +87,12 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8, D_U8, D_U8|Q_ASYM) - IO_TYPE(D_U8, D_U8, D_I8|Q_DFP) - IO_TYPE(D_U8, D_U8, D_I16|Q_DFP) - IO_TYPE(D_U8, D_U8, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_SYM) 
END_IO_TYPE_DECL(PRE_PROCESS_NV12) - if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_NV12, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(PRE_PROCESS_NV12, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c index d98910b..bcac93c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c @@ -87,10 +87,10 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM) - IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP) - IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP) - IO_TYPE(D_U8, D_U8, D_U8, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_SYM) END_IO_TYPE_DECL(PRE_PROCESS_YUV420) if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_YUV420, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c new file mode 100644 index 0000000..b9c4daf --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c @@ -0,0 +1,238 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _pre_process_yuv422_local_data_t { + int32_t placeholder; +} pre_process_yuv422_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + param =vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_yuv422.local->scale_x ); + vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_yuv422.local->scale_y ); + vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_yuv422.rect.left ); + vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_yuv422.rect.top ); + vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_yuv422.r_mean ); + vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_yuv422.g_mean ); + vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_yuv422.b_mean ); + vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_yuv422.rgb_scale ); + vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_yuv422.reverse_channel ); + vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_yuv422.local->enable_perm ); + vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_yuv422.local->enable_copy ); + vsi_nn_kernel_param_add_int32( param, "yuv422_type", self->nn_param.pre_process_yuv422.yuv422_type ); + n = vsi_nn_kernel_selector( self->graph, "pre_process_yuv422", inputs, 1, outputs, 1, param ); + if( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if(param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(PRE_PROCESS_YUV422, 1, 1) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_SYM) + END_IO_TYPE_DECL(PRE_PROCESS_YUV422) + if (!VALIDATE_OP_IO_TYPES(PRE_PROCESS_YUV422, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
*/ + vsi_nn_pre_process_yuv422_param * p = NULL; + uint32_t i = 0; + p = (vsi_nn_pre_process_yuv422_param *)&(self->nn_param.pre_process_yuv422); + + if (p->rect.width == 0 || p->rect.height == 0) + { + VSILOGE("Image size cannot be zero !(PRE_PROCESS_YUV422)\n"); + return FALSE; + } + else + { + for (i = 0; i < p->output_attr.dim_num; i++) + { + if (p->output_attr.size[i] == 0) + { + VSILOGE("output size cannot be zero!(PRE_PROCESS_YUV422)\n"); + return FALSE; + } + } + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + if (p->output_attr.dim_num > 0) + { + for (i = 0; i < p->output_attr.dim_num; i++) + { + if (p->output_attr.size[i] == 0) + { + VSILOGE("output size cannot be zero!(PRE_PROCESS_YUV422)\n"); + return FALSE; + } + else + { + outputs[0]->attr.dim_num = p->output_attr.dim_num; + outputs[0]->attr.size[i] = p->output_attr.size[i]; + } + } + } + else + { + VSILOGE("output dim num cannot be zero!(PRE_PROCESS_YUV422)\n"); + return FALSE; + } + } + + p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[0]->attr.size[0]); + p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[0]->attr.size[1]); + + p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15))); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + if (self->nn_param.pre_process_yuv422.local != NULL) + { + uint32_t i = 0; + for (i = 0; i < _VSI_NN_PRE_PROCESS_YUV422_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.pre_process_yuv422.local->local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.pre_process_yuv422.local->local_tensor[i])); + self->nn_param.pre_process_yuv422.local->local_tensor[i] = NULL; + } + } + free(self->nn_param.pre_process_yuv422.local); + self->nn_param.pre_process_yuv422.local = NULL; + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + self->nn_param.pre_process_yuv422.local = + (vsi_nn_pre_process_yuv422_lcl_data *)malloc(sizeof(vsi_nn_pre_process_yuv422_lcl_data)); + + if (NULL == self->nn_param.pre_process_yuv422.local) + { + return VX_ERROR_NO_MEMORY; + } + memset(self->nn_param.pre_process_yuv422.local, 0, sizeof(vsi_nn_pre_process_yuv422_lcl_data)); + + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ PRE_PROCESS_YUV422, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index 5a37151..b5489bf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -158,7 +158,7 @@ static vsi_bool _check_is_sp_supported_type { int32_t * axes = self->nn_param.reduce.local2->axes; int32_t axes_num = self->nn_param.reduce.local2->axes_num; - vsi_size_t shapes[4][VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t shapes[4][VSI_NN_MAX_DIM_NUM] = { {0} }; int32_t axis_in[VSI_NN_MAX_DIM_NUM] = {0}; int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; int32_t i = 0; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c index ced3cd7..04c4271 100644 --- 
a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c @@ -57,7 +57,7 @@ static vsi_status op_compute int32_t * axis = self->nn_param.reduce_mean_internal.axis; int32_t axis_num = self->nn_param.reduce_mean_internal.axis_num; float scale = self->nn_param.reduce_mean_internal.scale; - vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { {0} }; int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; uint32_t axis_size = 0; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu1.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu1.c index a1ba17c..5d1c2d4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relu1.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu1.c @@ -31,6 +31,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute ( @@ -39,22 +40,16 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_t n = NULL; - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU1, - 0, - 0, - outputs[0]->t - ); - - if( NULL != self->n ) + n = vsi_nn_kernel_selector( self->graph, "relu1", inputs, 1, outputs, 1, NULL ); + if ( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } + self->n = (vx_node)n; + return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu6.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu6.c index 9020e7d..c9fd754 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relu6.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu6.c @@ -31,7 +31,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" - +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute ( @@ -40,22 +40,16 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_t n = NULL; - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU6, - 0, - 0, - outputs[0]->t - ); - - if( NULL != self->n ) + n = vsi_nn_kernel_selector( self->graph, "relu6", inputs, 1, outputs, 1, NULL ); + if ( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } + self->n = (vx_node)n; + return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relun.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relun.c index 1cbf229..ea54ce4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relun.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relun.c @@ -32,7 +32,7 @@ #include "vsi_nn_log.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" - +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute ( @@ -41,18 +41,18 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; float top = self->nn_param.relun.relu_clamp_top; float bottom = self->nn_param.relun.relu_clamp_bottom; - vsi_enum func = -1; + vsi_nn_kernel_node_t n = NULL; - if(top == 1 && bottom == -1) + if (top == 1 && bottom == -1) { - func = VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU1; + n = vsi_nn_kernel_selector( self->graph, "relu1", inputs, 1, outputs, 1, NULL ); } - else if(top == 6) + else if (top == 6) { - func = 
VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU6; + n = vsi_nn_kernel_selector( self->graph, "relu6", inputs, 1, outputs, 1, NULL ); } else { @@ -60,19 +60,13 @@ static vsi_status op_compute return VSI_FAILURE; } - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - func, - 0, - 0, - outputs[0]->t - ); - - if( NULL != self->n ) + if ( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } + + self->n = (vx_node)n; + return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c index 295b6ee..2a77c5c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" @@ -53,6 +53,7 @@ static vsi_status _create_local_tensor vsi_nn_repeat_lcl_data *local = self->nn_param.repeat.local; vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; uint32_t i = 0; + vsi_status status = VSI_FAILURE; if (axis == -1) { @@ -63,6 +64,7 @@ static vsi_status _create_local_tensor } local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, 1); + CHECK_PTR_FAIL_GOTO( local->reshaped_input, "create tensor fail.", final ); shape[0] = 1; for(i = 0; i < outputs[0]->attr.dim_num; i++) @@ -70,6 +72,7 @@ static vsi_status _create_local_tensor shape[0] *= outputs[0]->attr.size[i]; } local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, 1); + CHECK_PTR_FAIL_GOTO( local->reshaped_output, "create tensor fail.", final ); } if (repeat_host) @@ -103,9 +106,12 @@ static vsi_status _create_local_tensor attr.dim_num = 2; local->repeat_tensor = vsi_nn_CreateTensorFromData(self->graph, (uint8_t*)repeat_host, &attr); + CHECK_PTR_FAIL_GOTO( local->repeat_tensor, "create tensor fail.", final ); } - return VSI_SUCCESS; + status = VSI_SUCCESS; +final: + return status; } static vsi_status op_compute diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c index 6ea0fc0..e1cfdaa 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c @@ -36,6 +36,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" +#include "vsi_nn_error.h" VSI_NN_SUPPRESS_DEPRECATED_BEGIN @@ -79,6 +80,7 @@ static vsi_status op_compute vsi_nn_tensor_t *tmp_tensor = NULL; tmp_tensor = vsi_nn_reshape_tensor( self->graph, outputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num ); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "create tensor fail.", final ); self->n = vxTensorCopyNode(self->graph->g, inputs[0]->t, tmp_tensor->t); @@ -88,7 +90,7 @@ static vsi_status op_compute status = VSI_FAILURE; } VSILOGD("Create a copy node for reshape"); - +final: vsi_safe_release_tensor(tmp_tensor); #endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c index fd544a8..002b39b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c @@ -135,6 +135,16 @@ static vsi_status op_optimize } else { + int32_t half_pixel_centers = self->nn_param.resize.half_pixel_centers; + vsi_size_t * input_size = inputs[0]->attr.size; + vsi_size_t * output_size = outputs[0]->attr.size; + + if ( (output_size[0] % input_size[0] == 0) && 
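/* integer scale factor in both dimensions: for nearest-neighbour resize the half-pixel-centers flag is cleared below */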
(output_size[1] % input_size[1] == 0) && + half_pixel_centers == TRUE && self->nn_param.resize.type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR ) + { + self->nn_param.resize.half_pixel_centers = FALSE; + } + return VSI_SUCCESS; } } /* op_optimize() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c index d5f3e54..282de4e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c @@ -157,18 +157,13 @@ static vsi_bool op_setup vsi_bool use_virtual_tensor = TRUE; uint32_t kernel_h = 1; uint32_t kernel_w = 1; - vsi_bool ret = FALSE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); - p->local = (vsi_nn_rnncell_ovxlib_lcl_data_t*) - malloc(sizeof(vsi_nn_rnncell_ovxlib_lcl_data_t)); - CHECK_PTR_FAIL_GOTO( p->local, "Create buffer fail.", final ); - ret = TRUE; memset(p->local, 0x00, sizeof(vsi_nn_rnncell_ovxlib_lcl_data_t)); memset(&attr, 0x00, sizeof(attr)); - p->local->multi_batch = (vsi_bool)(inputs[RNNCELL_INPUT_INPUT]->attr.size[1]); + p->local->multi_batch = (inputs[RNNCELL_INPUT_INPUT]->attr.size[1]>1); if( inputs[RNNCELL_INPUT_INPUT]->attr.dtype.qnt_type != inputs[RNNCELL_INPUT_WEIGHT_I]->attr.dtype.qnt_type) @@ -199,9 +194,6 @@ static vsi_bool op_setup { is_input_fc_on_tp = TRUE; } - /* TODO: now, all fc on tp because can't fetch the HW feature */ - is_input_fc_on_tp = TRUE; - is_hstate_fc_on_tp = TRUE; setup_op_shapes(self, inputs, outputs); @@ -212,7 +204,7 @@ static vsi_bool op_setup input_gate_fc_outputs = vsi_nn_rnn_create_tp_fc(self, inputs[RNNCELL_INPUT_INPUT], inputs[RNNCELL_INPUT_WEIGHT_I], - inputs[RNNCELL_INPUT_BIAS], + inputs[RNNCELL_INPUT_BIAS_I], &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I], use_virtual_tensor); if (inputs[RNNCELL_INPUT_AUX_INPUT] != NULL) @@ -237,7 +229,7 @@ static vsi_bool op_setup tmp = vsi_nn_rnn_create_nn_fc(self, input_tensor->t, inputs[RNNCELL_INPUT_WEIGHT_I], - inputs[RNNCELL_INPUT_BIAS], + inputs[RNNCELL_INPUT_BIAS_I], kernel_h, kernel_w, &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I], use_virtual_tensor); @@ -273,7 +265,7 @@ static vsi_bool op_setup hstate_gate_fc_outputs = vsi_nn_rnn_create_tp_fc(self, inputs[RNNCELL_INPUT_H_STATE], inputs[RNNCELL_INPUT_WEIGHT_H], - NULL, + inputs[RNNCELL_INPUT_BIAS_H], &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_H], use_virtual_tensor); } @@ -289,7 +281,7 @@ static vsi_bool op_setup tmp = vsi_nn_rnn_create_nn_fc(self, hstate_input_tensor->t, inputs[RNNCELL_INPUT_WEIGHT_H], - NULL, + inputs[RNNCELL_INPUT_BIAS_H], kernel_h, kernel_w, &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_H], use_virtual_tensor); @@ -331,8 +323,7 @@ static vsi_bool op_setup vsi_nn_internal_setup_node(self, curr); } -final: - return ret; + return TRUE; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c index 8121363..12668f0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c @@ -101,6 +101,7 @@ static vsi_bool op_check IO_TYPE(D_F32, D_F32, D_F16) IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_BF16, D_BF16, D_F32) + IO_TYPE(D_BF16, D_BF16, D_BF16) IO_TYPE(D_F32, D_F32, D_BF16) /* HW 9.0 */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c index b4c8666..e6ba3bf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c @@ -34,6 +34,7 @@ #include "vsi_nn_log.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute ( @@ -42,22 +43,16 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_t n = NULL; - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RSQRT, - 0, - 0, - outputs[0]->t - ); - - if( NULL != self->n ) + n = vsi_nn_kernel_selector( self->graph, "rsqrt", inputs, 1, outputs, 1, NULL ); + if ( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } + self->n = (vx_node)n; + return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c new file mode 100644 index 0000000..99f8e40 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c @@ -0,0 +1,171 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _scatter_elements_local_data_t { + int32_t placeholder; +} scatter_elements_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_scatter_elements_param * p = NULL; + + if ( NULL == self ) + { + return VSI_FAILURE; + } + status = VSI_FAILURE; + + p = &(self->nn_param.scatter_elements); + + // Add params + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "axis", p->axis ); + vsi_nn_kernel_param_add_int32( param, "reduction", p->reduction ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "scatter_elements", + inputs, 3, + outputs, 1, param ); + + vsi_nn_kernel_param_release( ¶m ); + + if ( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(SCATTER_ELEMENTS, 3, 1) + IO_TYPE(D_I32, D_I32, D_I32, D_I32) + IO_TYPE(D_F32, D_I32, D_F32, D_F32) + IO_TYPE(D_F16, D_I32, D_F16, D_F16) + IO_TYPE(D_BF16, D_I32, D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I32, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I32, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I32, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I32, D_I16|Q_DFP, D_I16|Q_SYM) + END_IO_TYPE_DECL(SCATTER_ELEMENTS) + if (!VALIDATE_OP_IO_TYPES(SCATTER_ELEMENTS, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i = 0; + uint32_t indices_dims = inputs[1]->attr.dim_num; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + + for (i = 0; i < indices_dims; i++) + { + if (inputs[1]->attr.size[i] != inputs[2]->attr.size[i]) + { + VSILOGE("Indices vs updates dimensions differs at position=%d, %d vs %d", i, + inputs[1]->attr.size[i], inputs[2]->attr.size[i]); + return FALSE; + } + } + + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SCATTER_ELEMENTS, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c index c95d75e..94e0110 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c @@ -142,6 +142,29 @@ static vsi_bool op_check IO_TYPE(D_I8, D_F16, D_F16, D_F16) IO_TYPE(D_I8, D_I32, D_I32, D_I32) IO_TYPE(D_I8, D_F32, D_F32, D_F32) + IO_TYPE(D_I8, D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8, D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM) + 
IO_TYPE(D_I8, D_F16, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8, D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8, D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8, D_F16, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8, D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8, D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_I8, D_I8|Q_SYM, D_F16, D_I8|Q_SYM) + IO_TYPE(D_I8, D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8, D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I8, D_I16|Q_SYM, D_F16, D_I16|Q_SYM) + IO_TYPE(D_I8, D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8, D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8, D_I8|Q_SYM, D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8, D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I8, D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I8, D_I16|Q_SYM, D_I16|Q_SYM, D_F16) + IO_TYPE(D_I8, D_BF16, D_BF16, D_BF16) + IO_TYPE(D_BOOL8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_BOOL8, D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) IO_TYPE(D_BOOL8, D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) @@ -155,7 +178,7 @@ static vsi_bool op_check IO_TYPE(D_BOOL8, D_F16, D_I8|Q_SYM, D_F16) IO_TYPE(D_BOOL8, D_F16, D_I16|Q_DFP, D_F16) IO_TYPE(D_BOOL8, D_F16, D_I16|Q_ASYM, D_F16) - IO_TYPE(D_BOOL8, D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_F16, D_I16|Q_SYM, D_F16) IO_TYPE(D_BOOL8, D_U8|Q_ASYM, D_F16, D_F16) IO_TYPE(D_BOOL8, D_I8|Q_DFP, D_F16, D_F16) IO_TYPE(D_BOOL8, D_I8|Q_ASYM, D_F16, D_F16) @@ -170,6 +193,28 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_BOOL8, D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_BOOL8, D_F16, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_BOOL8, D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_BOOL8, D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_BOOL8, D_F16, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_BOOL8, D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I8|Q_SYM, D_F16, D_I8|Q_SYM) + IO_TYPE(D_BOOL8, D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_BOOL8, D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_BOOL8, D_I16|Q_SYM, D_F16, D_I16|Q_SYM) + IO_TYPE(D_BOOL8, D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_BOOL8, D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_BOOL8, D_I8|Q_SYM, D_I8|Q_SYM, D_F16) + IO_TYPE(D_BOOL8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_BOOL8, D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_BOOL8, D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_BOOL8, D_I16|Q_SYM, D_I16|Q_SYM, D_F16) + IO_TYPE(D_BOOL8, D_BF16, D_BF16, D_BF16) END_IO_TYPE_DECL(SELECT) if (!VALIDATE_OP_IO_TYPES(SELECT, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softrelu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softrelu.c index bf7566c..d9c0246 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softrelu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softrelu.c @@ -32,7 +32,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" - +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute ( @@ -41,22 +41,16 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + 
vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_t n = NULL; - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SOFTRELU, - 0, - 0, - outputs[0]->t - ); - - if( NULL != self->n ) + n = vsi_nn_kernel_selector( self->graph, "softrelu", inputs, 1, outputs, 1, NULL ); + if ( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } + self->n = (vx_node)n; + return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_space2batch.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2batch.c index 599c78a..a15c4dc 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_space2batch.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2batch.c @@ -34,6 +34,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" +#include "vsi_nn_error.h" #include "utils/vsi_nn_constraint_check.h" static vsi_status op_compute @@ -47,7 +48,37 @@ static vsi_status op_compute vx_nn_reorg_params_ext_t param; vsi_nn_tensor_t *block_size_tensor = NULL; vsi_nn_tensor_t *pad_tensor = NULL; + vsi_nn_tensor_t *input_tensor = NULL; + vsi_nn_tensor_t *output_tensor = NULL; vsi_nn_tensor_attr_t attr; + vsi_bool need_release_tensor = TRUE; + int32_t block_size[2] = {1, 1}; + + block_size[0] = self->nn_param.space2batch.block_size[0]; + if (vsi_nn_is_3d_tensor(inputs[0])) + { + vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{1}}; + memcpy(shape[0], inputs[0]->attr.size, sizeof(shape[0])); + memcpy(shape[1], outputs[0]->attr.size, sizeof(shape[1])); + shape[0][3] = shape[0][2]; + shape[0][2] = shape[0][1]; + shape[0][1] = 1; + shape[1][3] = shape[1][2]; + shape[1][2] = shape[1][1]; + shape[1][1] = 1; + + input_tensor = vsi_nn_reshape_tensor(self->graph, inputs[0], shape[0], 4); + CHECK_PTR_FAIL_GOTO( input_tensor, "craete tensor fail.", final ); + output_tensor = vsi_nn_reshape_tensor(self->graph, outputs[0], shape[1], 4); + CHECK_PTR_FAIL_GOTO( output_tensor, "craete tensor fail.", final ); + } + else + { + block_size[1] = self->nn_param.space2batch.block_size[1]; + need_release_tensor = FALSE; + input_tensor = inputs[0]; + output_tensor = outputs[0]; + } memset(¶m, 0, sizeof(vx_nn_reorg_params_t)); memset(&attr, 0, sizeof(attr)); @@ -58,13 +89,9 @@ static vsi_status op_compute attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; block_size_tensor = vsi_nn_CreateTensorFromData( self->graph, - (uint8_t *)self->nn_param.space2batch.block_size, + (uint8_t *)block_size, &attr); - if( NULL == block_size_tensor ) - { - VSILOGE("Create block_size_tensor fail.(space2batch)"); - return VSI_FAILURE; - } + CHECK_PTR_FAIL_GOTO( block_size_tensor, "craete tensor fail.", final ); memset(&attr, 0, sizeof(attr)); attr.size[0] = 4; @@ -76,31 +103,32 @@ static vsi_status op_compute self->graph, (uint8_t *)self->nn_param.space2batch.pad, &attr); - if( NULL == pad_tensor ) - { - VSILOGE("Create pad_tensor fail.(space2batch)"); - vsi_nn_ReleaseTensor(&block_size_tensor); - block_size_tensor = NULL; - return VSI_FAILURE; - } + CHECK_PTR_FAIL_GOTO( pad_tensor, "craete tensor fail.", final ); - self->nn_param.space2batch.local.block_size_tensor = block_size_tensor; - self->nn_param.space2batch.local.pad_tensor = pad_tensor; param.base.block_size = REQUIRED_IO(block_size_tensor); param.pad = OPTIONAL_IO(pad_tensor); param.base.type = VX_REORG_SPACE_TO_BATCH_ND; self->n = vxReorgLayer2( self->graph->g, - inputs[0]->t, + input_tensor->t, (vx_nn_reorg_params_t *)¶m, sizeof(vx_nn_reorg_params_ext_t), - outputs[0]->t); + output_tensor->t); - if( NULL 
!= self->n ) + if ( NULL != self->n ) { status = VSI_SUCCESS; } +final: + if (need_release_tensor) + { + vsi_safe_release_tensor(input_tensor); + vsi_safe_release_tensor(output_tensor); + } + vsi_safe_release_tensor(block_size_tensor); + vsi_safe_release_tensor(pad_tensor); + return status; } /* op_compute() */ @@ -113,14 +141,13 @@ static vsi_bool op_check { vsi_bool ret = FALSE; - if (inputs[0]->attr.dim_num != 4) + if (inputs[0]->attr.dim_num < 3) { - VSILOGE("The input tensor shape must be 4-D!(space2batch)"); + VSILOGE("The input tensor shape must be 3D or 4D!(space2batch)"); return FALSE; } - if(self->nn_param.space2batch.block_size[0] < 0 - || self->nn_param.space2batch.block_size[1] < 0 + if (self->nn_param.space2batch.block_size[0] < 0 || self->nn_param.space2batch.pad[0] < 0 || self->nn_param.space2batch.pad[1] < 0 || self->nn_param.space2batch.pad[2] < 0 @@ -145,38 +172,45 @@ static vsi_bool op_setup vsi_nn_space2batch_param * p; p = (vsi_nn_space2batch_param *)&(self->nn_param.space2batch); - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - outputs[0]->attr.size[3] = - inputs[0]->attr.size[3] * p->block_size[0] * p->block_size[1]; - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; - outputs[0]->attr.size[1] = - (p->pad[2] + p->pad[3] + inputs[0]->attr.size[1]) / p->block_size[1]; - outputs[0]->attr.size[0] = - (p->pad[0] + p->pad[1] + inputs[0]->attr.size[0]) / p->block_size[0]; - outputs[0]->attr.dim_num = 4; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + + if (vsi_nn_is_3d_tensor(inputs[0])) + { + outputs[0]->attr.size[2] = + inputs[0]->attr.size[2] * p->block_size[0]; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; + outputs[0]->attr.size[0] = + (p->pad[0] + p->pad[1] + inputs[0]->attr.size[0]) / p->block_size[0]; + } + else + { + outputs[0]->attr.size[3] = + inputs[0]->attr.size[3] * p->block_size[0] * p->block_size[1]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[1] = + (p->pad[2] + p->pad[3] + inputs[0]->attr.size[1]) / p->block_size[1]; + outputs[0]->attr.size[0] = + (p->pad[0] + p->pad[1] + inputs[0]->attr.size[0]) / p->block_size[0]; + } } return TRUE; } /* op_setup() */ -static vsi_status op_deinit +static vsi_status op_init ( vsi_nn_node_t * self ) { - if (self->nn_param.space2batch.local.block_size_tensor != NULL) - { - vsi_nn_ReleaseTensor(&(self->nn_param.space2batch.local.block_size_tensor)); - } - if (self->nn_param.space2batch.local.pad_tensor != NULL) - { - vsi_nn_ReleaseTensor(&(self->nn_param.space2batch.local.pad_tensor)); - } - vsi_nn_op_common_deinit(self); + vsi_status status = VSI_SUCCESS; + vsi_nn_space2batch_param *p = &self->nn_param.space2batch; - return VSI_SUCCESS; -} /* op_deinit() */ + memset(p->pad, 0, sizeof(p->pad)); + + return status; +} /* op_init() */ #ifdef __cplusplus extern "C" { @@ -185,9 +219,9 @@ extern "C" { DEF_OP_REG ( /* op_name */ SPACE2BATCH, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, - /* deinit */ op_deinit, + /* deinit */ vsi_nn_op_common_deinit, /* check */ op_check, /* setup */ op_setup, /* optimize */ NULL, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sqrt.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sqrt.c index e711b48..5ae3844 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_sqrt.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sqrt.c @@ -32,7 +32,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" - +#include "kernel/vsi_nn_kernel.h" static 
vsi_status op_compute ( @@ -41,22 +41,16 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_t n = NULL; - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SQRT, - 0, - 0, - outputs[0]->t - ); - - if( NULL != self->n ) + n = vsi_nn_kernel_selector( self->graph, "sqrt", inputs, 1, outputs, 1, NULL ); + if ( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } + self->n = (vx_node)n; + return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c index 5fe93f7..3609aad 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c @@ -88,7 +88,7 @@ static vsi_bool op_check } } - ret = vsi_nn_OpCheck(VSI_NN_OP_RSQRT, self, inputs, outputs); + ret = vsi_nn_OpCheck(VSI_NN_OP_DATACONVERT, self, inputs, outputs); return ret; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c index fad9cbc..bf12b96 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c @@ -53,6 +53,7 @@ static vsi_bool setup_op_shapes vsi_size_t num_units = 0; vsi_size_t output_size = 0; vsi_size_t batch_size = 0; + vsi_bool use_virtual_tensor = TRUE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); @@ -82,6 +83,17 @@ static vsi_bool setup_op_shapes inputs[RNN_INPUT_H_STATE] = output_tensor->t; } + if( !outputs[RNN_OUTPUT_H_STATE] ) + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + memcpy( &attr.dtype, &outputs[RNN_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + outputs[RNN_OUTPUT_H_STATE] = output_tensor->t; + } + /* output */ if( VSI_NN_DIM_AUTO == outputs[RNN_OUTPUT_OUTPUT]->attr.dim_num ) { @@ -91,6 +103,14 @@ static vsi_bool setup_op_shapes outputs[RNN_OUTPUT_OUTPUT]->attr.dim_num = 3; } + /* output_state_out */ + if( VSI_NN_DIM_AUTO == outputs[RNN_OUTPUT_H_STATE]->attr.dim_num ) + { + outputs[RNN_OUTPUT_H_STATE]->attr.size[0] = output_size; + outputs[RNN_OUTPUT_H_STATE]->attr.size[1] = batch_size; + outputs[RNN_OUTPUT_H_STATE]->attr.dim_num = 2; + } + return TRUE; } @@ -207,7 +227,7 @@ static vsi_bool op_setup /* rnncell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, - &outputs[RNNCELL_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); + &outputs[RNN_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); rnncell_out1 = output_tensor->t; @@ -221,8 +241,8 @@ static vsi_bool op_setup curr->inputs[RNNCELL_INPUT_WEIGHT_I] = inputs[RNN_INPUT_WEIGHT_I]; curr->inputs[RNNCELL_INPUT_WEIGHT_H] = inputs[RNN_INPUT_WEIGHT_H]; - curr->inputs[RNNCELL_INPUT_BIAS] = inputs[RNN_INPUT_BIAS]; - + curr->inputs[RNNCELL_INPUT_BIAS_I] = inputs[RNN_INPUT_BIAS_I]; + curr->inputs[RNNCELL_INPUT_BIAS_H] = inputs[RNN_INPUT_BIAS_H]; curr->outputs[RNNCELL_OUTPUT_OUTPUT] = rnncell_out0; curr->outputs[RNNCELL_OUTPUT_H_STATE] = rnncell_out1; @@ -246,6 +266,14 @@ static vsi_bool op_setup tensor = output_tensor->t; } + if (outputs[RNN_OUTPUT_H_STATE] != NULL) + { + curr = 
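/*
 * Sketch of the kernel-selector dispatch that now backs the softrelu and
 * sqrt op_compute paths above, written against the signatures visible in
 * this patch (vsi_nn_kernel_selector returning a vsi_nn_kernel_node_t
 * that is stored back into self->n). The kernel_name parameter and the
 * helper name are illustrative, not part of the patch.
 */
#include "vsi_nn_node.h"
#include "vsi_nn_tensor.h"
#include "kernel/vsi_nn_kernel.h"

static vsi_status unary_compute_via_selector
    (
    vsi_nn_node_t   * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs,
    const char      * kernel_name   /* e.g. "sqrt" or "softrelu" */
    )
{
    /* The kernel framework picks the best available backend for the op. */
    vsi_nn_kernel_node_t n = vsi_nn_kernel_selector( self->graph, kernel_name,
                                                     inputs, 1, outputs, 1, NULL );
    if ( n == NULL )
    {
        return VSI_FAILURE;
    }
    self->n = (vx_node)n;
    return VSI_SUCCESS;
}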
vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = last_step_h_state; + curr->outputs[0] = outputs[RNN_OUTPUT_H_STATE]; + vsi_nn_internal_setup_node(self, curr); + } + /* concat rnncell output, the rnn's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); curr->node->nn_param.concat.axis = 2; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index b782511..46a5409 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -451,6 +451,10 @@ static _op_param_gen_t s_op_gen[] = /* CUMSUM */ NULL, /* MAXPOOLWITHARGMAX */ NULL, /* MOD */ NULL, + /* LPPOOL */ NULL, + /* SCATTER_ELEMENTS */ NULL, + /* PRE_PROCESS_YUV422 */ NULL, + /* BUCKETIZE */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c index f170bcf..95f5cc7 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c @@ -45,6 +45,8 @@ static const char* _get_dtype_name(vsi_nn_type_e type) switch(type) { case D_NONE: return "Optional"; + case D_I4: return "INT4"; + case D_U4: return "UINT4"; case D_I8: return "INT8"; case D_I16: return "INT16"; case D_I32: return "INT32"; @@ -73,6 +75,7 @@ static const char* _get_qtype_name(vsi_nn_qnt_type_e type) case VSI_NN_QNT_TYPE_NONE: return ""; case VSI_NN_QNT_TYPE_DFP: return "DFP"; case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: return "ASYM"; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: return "SYM"; case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: return "SYMM PC"; default: VSILOGE("Unknown quant type: %d\n", type); @@ -234,14 +237,14 @@ char* generate_op_io_types_desc memset(desc, 0x00, sizeof(char) * total_sz); for(i = 0; i < inputs_num; i++) { - if(inputs[i]) { + if(inputs[i] && total_sz >= used_sz) { used_sz += snprintf(desc + used_sz, total_sz - used_sz, "%s %s, ", _get_qtype_name(inputs[i]->attr.dtype.qnt_type), _get_dtype_name(inputs[i]->attr.dtype.vx_type)); } } for(i = 0; i < outputs_num; i++) { - if(outputs[i]) { + if(outputs[i] && total_sz >= used_sz) { used_sz += snprintf(desc + used_sz, total_sz - used_sz, "%s %s, ", _get_qtype_name(outputs[i]->attr.dtype.qnt_type), _get_dtype_name(outputs[i]->attr.dtype.vx_type)); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c index 2e6b26e..6f69616 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -338,20 +338,21 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm_perchannel void * out_buffer ) { + vsi_bool status; switch( dtype ) { case I8: - vsi_nn_dtype_convert_float_to_quantize_symm8_perchannel( - buffer, size, shape, rank, - scale, scale_size, zero_point, zero_point_size, - channel_dim, (int8_t*)out_buffer ); + status = vsi_nn_dtype_convert_float_to_quantize_symm8_perchannel( + buffer, size, shape, rank, + scale, scale_size, zero_point, zero_point_size, + channel_dim, (int8_t*)out_buffer ); break; default: VSILOGE("Don't support convert float to symm perchannel quant %d.", dtype); - return FALSE; + status = FALSE; } - return TRUE; + return status; } /* vsi_nn_dtype_convert_float_to_quantize_symm_perchannel() */ vsi_bool vsi_nn_dtype_convert_dtype_to_float @@ -496,17 
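/*
 * Standalone sketch of the bounded-append guard added above in
 * generate_op_io_types_desc() (and in the tensor text dumpers later in
 * this patch): each snprintf() advance is re-checked against the
 * remaining space, so an overlong type list truncates instead of writing
 * past the end of the buffer. Names here are illustrative.
 */
#include <stdio.h>

static size_t append_item(char *buf, size_t buf_sz, size_t used, const char *item)
{
    int written;

    if (used >= buf_sz)  /* no room left: keep the running count, skip the write */
    {
        return used;
    }
    /* snprintf never stores more than buf_sz - used bytes (including the
     * terminating NUL), but it returns the length it wanted to write, so
     * 'used' can exceed buf_sz and must be re-checked before the next call. */
    written = snprintf(buf + used, buf_sz - used, "%s, ", item);
    if (written > 0)
    {
        used += (size_t)written;
    }
    return used;
}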
+497,18 @@ vsi_bool vsi_nn_dtype_convert_quantize_symm_perchannel_to_float float * out_buffer ) { + vsi_bool status; switch( dtype ) { case I8: - vsi_nn_dtype_convert_quantize_symm8_perchannel_to_float( - (const int8_t*)buffer, size, shape, rank, - scale, scale_size, zero_point, zero_point_size, - channel_dim, out_buffer ); + status = vsi_nn_dtype_convert_quantize_symm8_perchannel_to_float( + (const int8_t*)buffer, size, shape, rank, + scale, scale_size, zero_point, zero_point_size, + channel_dim, out_buffer ); break; default: VSILOGE("Don't support convert symm perchannel quant %d to float.", dtype); - return FALSE; + status = FALSE; } - return TRUE; + return status; } /* vsi_nn_dtype_convert_quantize_symm_perchannel_to_float() */ diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index 25ffab7..21c8498 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -201,6 +201,33 @@ static vsi_size_t _compute_padding return vsi_nn_max(padding, 0); } /* _compute_padding() */ +int32_t vsi_nn_get_vx_pad_mode + ( + vsi_nn_pad_mode_e mode + ) +{ + int32_t pad_mode = 0; + switch (mode) { + case VSI_NN_PAD_MODE_CONSTANT: + pad_mode = VX_PAD_CONSTANT; + break; + case VSI_NN_PAD_MODE_REPLICATE: + pad_mode = VX_PAD_REPLICATE; + break; + case VSI_NN_PAD_MODE_SYMMETRIC: + pad_mode = VX_PAD_MIRROR_SYMMETRIC; + break; + case VSI_NN_PAD_MODE_REFLECT: + pad_mode = VX_PAD_MIRROR_REFLECT; + break; + default: + VSILOGE("Wrong pad_mode value"); + break; + } + + return pad_mode; +} + uint8_t * vsi_nn_LoadBinaryData ( const char * filename, @@ -1486,3 +1513,18 @@ vsi_status vsi_nn_Unpack4bitData } return status; } /* vsi_nn_Unpack4bitData() */ + +vsi_bool vsi_nn_is_3d_tensor + ( + vsi_nn_tensor_t * tensor + ) +{ + if (3 == tensor->attr.dim_num) + { + return TRUE; + } + else + { + return FALSE; + } +} diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index cbddf2d..a8a99d4 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -63,7 +63,11 @@ static vsi_status query_hardware_caps context->config.support_stream_processor = paramExt.supportStreamProcessor; context->config.sp_exec_count = paramExt2.streamProcessorExecCount; context->config.sp_vector_depth = paramExt2.streamProcessorVectorSize; - context->config.sp_per_core_vector_depth = context->config.sp_vector_depth / context->config.sp_exec_count; + if (context->config.sp_exec_count > 0) + { + context->config.sp_per_core_vector_depth = + context->config.sp_vector_depth / context->config.sp_exec_count; + } #endif #endif @@ -130,6 +134,13 @@ static vsi_status vsi_nn_initOptions options->enable_asymi8_to_u8 = atoi(env_s); } + env_s = NULL; + options->enable_dataconvert_optimize = 1; + if (vsi_nn_getEnv("VSI_NN_ENABLE_DATACONVERT_OPTIMIZE", &env_s) && env_s) + { + options->enable_dataconvert_optimize = atoi(env_s); + } + return VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index 535f595..cf44888 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -39,6 +39,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_vdata.h" #include "utils/vsi_nn_map.h" +#include "utils/vsi_nn_dtype_util.h" #include "vsi_nn_graph_optimization.h" #include "vsi_nn_error.h" @@ -2251,3 +2252,126 @@ vsi_bool vsi_nn_IsGraphFastMode { return NULL == graph ? 
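/*
 * Usage sketch for the two helpers introduced above, assuming the
 * signatures added by this patch: vsi_nn_get_vx_pad_mode() maps the
 * framework pad enum onto the OpenVX VX_PAD_* values, and
 * vsi_nn_is_3d_tensor() is a dim_num == 3 predicate. The wrapper names
 * are illustrative.
 */
#include <stdint.h>
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"

static int32_t pick_vx_pad_mode(vsi_nn_pad_mode_e mode)
{
    /* e.g. VSI_NN_PAD_MODE_SYMMETRIC -> VX_PAD_MIRROR_SYMMETRIC;
     * unknown values fall back to 0 after an error log. */
    return vsi_nn_get_vx_pad_mode(mode);
}

static vsi_bool needs_4d_lift(vsi_nn_tensor_t *t)
{
    /* space2batch uses this to decide whether to view [W,H,C] as [W,1,H,C]. */
    return vsi_nn_is_3d_tensor(t);
}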
FALSE : graph->isAllowFastMode; } + +vsi_status vsi_nn_CopyTensorViaGraphs + ( + vsi_nn_graph_t *src_graph, + vsi_nn_tensor_id_t src_tensor_id, + vsi_nn_graph_t *dst_graph, + vsi_nn_tensor_id_t dst_tensor_id + ) +{ + vsi_status status = VSI_FAILURE; + uint8_t *data = NULL; + vsi_nn_tensor_t *src_tensor = NULL; + vsi_nn_tensor_t *dst_tensor = NULL; + vsi_size_t i; + + src_tensor = vsi_nn_GetTensor(src_graph, src_tensor_id); + TEST_CHECK_PTR(src_tensor, final); + dst_tensor = vsi_nn_GetTensor(dst_graph, dst_tensor_id); + TEST_CHECK_PTR(dst_tensor, final); + + /* Check shape and dtype */ + if(src_tensor->attr.dim_num != dst_tensor->attr.dim_num) + { + VSILOGE("The dim_num of src_tensor and dst_tensor don't match."); + return status; + } + for(i=0; i<src_tensor->attr.dim_num; i++) + { + if(src_tensor->attr.size[i] != dst_tensor->attr.size[i]) + { + VSILOGE("The shape of src_tensor and dst_tensor don't match."); + return status; + } + } + if(vsi_nn_DtypeCompare(&src_tensor->attr.dtype, &dst_tensor->attr.dtype) == FALSE) + { + VSILOGE("The dtype of src_tensor and dst_tensor don't match."); + return status; + } + + data = vsi_nn_ConvertTensorToData(src_graph, src_tensor); + TEST_CHECK_PTR(data, final); + + status = vsi_nn_CopyDataToTensor(dst_graph, dst_tensor, data); + TEST_CHECK_STATUS(status, final); + +final: + vsi_nn_safe_free(data); + return status; +} /* vsi_nn_CopyTensorViaGraphs() */ + +vsi_status vsi_nn_ExecuteGraphLoop + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_t *max_iteration_tensor + ) +{ + int32_t i,j,loop_var_num,max_iteration; + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_t *iteration_index = NULL; + vsi_nn_tensor_t *iteration_cond_out = NULL; + uint8_t *data = NULL; + int8_t cond = 0; + vsi_size_t sz = 0; + + sz = vsi_nn_ShapeProduct(max_iteration_tensor->attr.size, max_iteration_tensor->attr.dim_num); + if(1 != sz) // its shape should be 1. + { + VSILOGE("Invalid max_iteration_tensor."); + return status; + } + + loop_var_num = graph->input.num - 2; + iteration_index = vsi_nn_GetTensor(graph, graph->input.tensors[0]); + iteration_cond_out = vsi_nn_GetTensor(graph, graph->output.tensors[0]); + + data = vsi_nn_ConvertTensorToData(NULL, max_iteration_tensor); + TEST_CHECK_PTR(data, final); + max_iteration = ((int32_t *)data)[0]; + vsi_nn_safe_free(data); + + for(i=0; ioutput.tensors[0], + graph, graph->input.tensors[1] + ); + TEST_CHECK_STATUS(status, final); + for(j=0; j<loop_var_num; j++) + { + status = vsi_nn_CopyTensorViaGraphs( + graph, graph->output.tensors[j + 1], + graph, graph->input.tensors[j + 2] + ); + TEST_CHECK_STATUS(status, final); + } + } + +final: + vsi_nn_safe_free(data); + return status; +} /* vsi_nn_ExecuteGraphLoop() */ diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index 7a0d809..855189b 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -574,7 +574,7 @@ static vx_tensor _create_const_raw_tensor if( TRUE == attr.is_created_from_handle ) { - vx_tensor_addressing addr; + vx_tensor_addressing addr = NULL; vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM]; vsi_size_t buf_sz; @@ -649,7 +649,15 @@ static vx_tensor _create_const_raw_tensor addr, data, VX_MEMORY_TYPE_HOST); #endif //memset(data, 0x5A, buf_sz); - vxReleaseTensorAddressing( &addr ); + if (addr) + { + vxReleaseTensorAddressing( &addr ); + } + if ( NULL == tensor ) + { + VSILOGE( "Create vx tensor fail."
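/*
 * Usage sketch for vsi_nn_CopyTensorViaGraphs() defined above: it checks
 * that the two tensors agree in rank, shape and dtype before copying the
 * source data into the destination graph's tensor. The wrapper name is
 * illustrative; the prototype is assumed to live in vsi_nn_graph.h, which
 * this patch extends.
 */
#include "vsi_nn_graph.h"

static vsi_status forward_loop_state
    (
    vsi_nn_graph_t    * producer,
    vsi_nn_tensor_id_t  producer_out,
    vsi_nn_graph_t    * consumer,
    vsi_nn_tensor_id_t  consumer_in
    )
{
    /* Returns VSI_FAILURE if the shapes or dtypes do not match. */
    return vsi_nn_CopyTensorViaGraphs( producer, producer_out,
                                       consumer, consumer_in );
}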
); + goto final; + } vxFlushHandle( (vx_reference)tensor ); } } @@ -664,6 +672,8 @@ static vx_tensor _create_const_raw_tensor tensor = vxCreateVirtualTensor2( graph->g, &params, sizeof( vx_tensor_create_params_t ) ); } + +final: if( NULL == tensor ) { VSILOGE( "Create vx tensor fail." ); diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c index 1845bc7..0c870bc 100644 --- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c +++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c @@ -198,6 +198,8 @@ static _node_template s_template[] = /* CUMSUM */ NULL, /* MAXPOOLWITHARGMAX */ NULL, /* MOD */ NULL, + /* LPPOOL */ NULL, + /* PRE_PROCESS_YUV422 */ NULL, }; //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c ); diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index 54236c0..c931dd6 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -433,7 +433,7 @@ static vsi_bool _init_tensor #endif if( TRUE == tensor->attr.is_created_from_handle ) { - vx_tensor_addressing addr; + vx_tensor_addressing addr = NULL; vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM]; vsi_size_t buf_sz; @@ -529,7 +529,16 @@ static vsi_bool _init_tensor #endif //memset(data, 0x5A, buf_sz); - vxReleaseTensorAddressing( &addr ); + if (addr) + { + vxReleaseTensorAddressing( &addr ); + } + + if ( NULL == tensor->t ) + { + ret = FALSE; + goto final; + } vxFlushHandle( (vx_reference)tensor->t ); } } @@ -544,10 +553,11 @@ static vsi_bool _init_tensor tensor->t = vxCreateVirtualTensor2( graph->g, &params, sizeof( vx_tensor_create_params_t ) ); } - if( NULL == tensor->t ) + if ( NULL == tensor->t ) { VSILOGE( "Create vx tensor fail." ); ret = FALSE; + goto final; } if( !tensor->attr.vtl && !tensor->attr.is_const ) @@ -565,6 +575,7 @@ static vsi_bool _init_tensor ret = _try_set_const_tensor( tensor ); +final: if( scales ) { free(scales); @@ -1243,6 +1254,11 @@ void vsi_nn_SaveTensorToTextByFp32 count += snprintf( (char *)&buf[count], _TENSOR_TMPBUF_SZ - count, "%f%s", write_data, seperator ); + if ( count > _TENSOR_TMPBUF_SZ ) + { + VSILOGW( "tensor buffer overflow!" ); + break; + } if( ((float)count / _TENSOR_TMPBUF_SZ) > c_flush_th ) { fwrite( buf, count, 1, fp ); @@ -1335,11 +1351,21 @@ void vsi_nn_SaveDataToText { count += snprintf( (char *)&buf[count], _TENSOR_TMPBUF_SZ - count, "%d%s", (int32_t)write_data, seperator ); + if ( count > _TENSOR_TMPBUF_SZ ) + { + VSILOGW( "tensor buffer overflow!" ); + break; + } } else { count += snprintf( (char *)&buf[count], _TENSOR_TMPBUF_SZ - count, "%f%s", write_data, seperator ); + if ( count > _TENSOR_TMPBUF_SZ ) + { + VSILOGW( "tensor buffer overflow!"
); + break; + } } if( ((float) count / _TENSOR_TMPBUF_SZ ) > c_flush_th ) { @@ -1358,8 +1384,8 @@ void vsi_nn_SaveTensorToBinary const char * filename ) { - uint8_t * data; - FILE * fp; + uint8_t * data = NULL; + FILE * fp = NULL; vsi_size_t sz; uint32_t i; uint8_t * packed_data = NULL; @@ -1391,6 +1417,12 @@ void vsi_nn_SaveTensorToBinary packed_size = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, tensor->attr.dtype.vx_type); packed_data = (uint8_t*)malloc(packed_size); + if ( NULL == packed_data ) + { + VSILOGW( "malloc packed data failed" ); + goto final; + } + vsi_nn_Pack4bitData(tensor, data, packed_data); fwrite( packed_data, packed_size, 1, fp ); if( packed_data ) @@ -1407,9 +1439,14 @@ void vsi_nn_SaveTensorToBinary } fwrite( data, sz, 1, fp ); } - fclose( fp ); + final: + if (fp) + { + fclose( fp ); + } vsi_nn_safe_free( data ); + vsi_nn_safe_free( packed_data ); } /* vsi_nn_SaveTensorToBinary() */ vsi_nn_tensor_t * vsi_nn_CreateTensorFromData
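/*
 * Standalone sketch of the single-exit cleanup style the
 * vsi_nn_SaveTensorToBinary() fixes above move towards: resources start as
 * NULL, failures jump to 'final', and the cleanup block only releases what
 * was actually acquired. Plain C; the file name and sizes are illustrative.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int save_blob(const char *path, size_t sz)
{
    int   ok   = 0;
    FILE *fp   = NULL;
    void *data = NULL;

    fp = fopen(path, "wb");
    if (fp == NULL) goto final;

    data = malloc(sz);
    if (data == NULL) goto final;   /* mirrors the new packed_data NULL check */
    memset(data, 0, sz);

    ok = (fwrite(data, 1, sz, fp) == sz);

final:
    if (fp)                         /* fclose() only when the handle exists */
    {
        fclose(fp);
    }
    free(data);                     /* free(NULL) is a no-op */
    return ok;
}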