From 2d9e614a0625ae041a16abf9024507982c3f78cd Mon Sep 17 00:00:00 2001 From: Chen Feiyue <69809761+chenfeiyue-cfy@users.noreply.github.com> Date: Wed, 3 Jan 2024 13:13:15 +0800 Subject: [PATCH] Update internal ovxlib to rel/1.2.2 (#674) Update to SHA:806fcd6a69d333e62508acf0a6aa2c38c8385eae Type: Code Improvement Signed-off-by: Feiyue Chen --- src/tim/vx/internal/.gitignore | 3 + src/tim/vx/internal/include/interface/ops.def | 2 + .../include/kernel/vsi_nn_kernel_lut.h | 1 + .../vsi_nn_op_bidirectional_sequence_lstm.h | 11 + .../include/ops/vsi_nn_op_crop_and_resize.h | 47 + .../include/ops/vsi_nn_op_lstm_ovxlib.h | 5 + .../include/ops/vsi_nn_op_lstmunit_ovxlib.h | 5 + .../internal/include/ops/vsi_nn_op_resize.h | 3 +- .../include/ops/vsi_nn_op_scatter_nd_update.h | 1 + .../vx/internal/include/utils/vsi_nn_util.h | 6 + .../vx/internal/include/vip/virtual_device.h | 1 + src/tim/vx/internal/include/vsi_nn_context.h | 2 + .../internal/include/vsi_nn_feature_config.h | 33 + src/tim/vx/internal/include/vsi_nn_graph.h | 37 + src/tim/vx/internal/include/vsi_nn_node.h | 16 + .../vx/internal/include/vsi_nn_node_type.h | 2 + src/tim/vx/internal/include/vsi_nn_platform.h | 3 + .../include/vsi_nn_pre_post_process.h | 2 + src/tim/vx/internal/include/vsi_nn_pub.h | 5 + .../vx/internal/include/vsi_nn_tensor_util.h | 76 + src/tim/vx/internal/include/vsi_nn_types.h | 14 +- src/tim/vx/internal/include/vsi_nn_version.h | 4 +- src/tim/vx/internal/src/Android.mk | 20 +- .../internal/src/kernel/cl/comparisons_cl.c | 12 + .../src/kernel/cl/crop_and_resize_cl.c | 359 + .../src/kernel/cl/depth2space_internal_cl.c | 4 +- .../src/kernel/cl/detect_post_box_cl.c | 300 - .../src/kernel/cl/detect_post_nms_cl.c | 197 - .../internal/src/kernel/cl/eltwise_unary_cl.c | 11 + src/tim/vx/internal/src/kernel/cl/gather_cl.c | 3 +- .../src/kernel/cl/grucell_activation_z_h_cl.c | 12 + .../cl/grucell_reset_after_activation_cl.c | 4 + .../src/kernel/cl/layer_normalization_cl.c | 3 +- .../internal/src/kernel/cl/log_softmax_cl.c | 198 +- .../vx/internal/src/kernel/cl/matrixmul_cl.c | 73 +- .../vx/internal/src/kernel/cl/maximum_cl.c | 67 +- .../vx/internal/src/kernel/cl/minimum_cl.c | 66 +- src/tim/vx/internal/src/kernel/cl/pow_cl.c | 3 +- .../internal/src/kernel/cl/resize_cubic_cl.c | 320 + .../cl/scatter_nd_update_reduction_cl.c | 727 +++ src/tim/vx/internal/src/kernel/cl/select_cl.c | 2 + src/tim/vx/internal/src/kernel/cl/tile_cl.c | 2 + src/tim/vx/internal/src/kernel/cl/topk_cl.c | 2 +- .../src/kernel/evis/add_mean_std_norm_evis.c | 98 +- .../src/kernel/evis/batchnorm_single_evis.c | 38 +- .../kernel/evis/bilinear_grid_sample_evis.c | 83 +- .../vx/internal/src/kernel/evis/clip_evis.c | 1 - .../src/kernel/evis/comparisons_evis.c | 41 +- .../src/kernel/evis/conv1d_ovxlib_evis.c | 23 +- .../src/kernel/evis/crop_and_resize_evis.c | 540 ++ .../vx/internal/src/kernel/evis/cumsum_evis.c | 36 +- .../kernel/evis/depth2space_internal_evis.c | 53 +- .../src/kernel/evis/depthwise_conv1d_evis.c | 10 +- .../src/kernel/evis/detect_post_box_evis.c | 15 +- .../src/kernel/evis/eltwise_unary_evis.c | 57 +- .../vx/internal/src/kernel/evis/erf_evis.c | 39 +- .../internal/src/kernel/evis/floordiv_evis.c | 62 +- .../vx/internal/src/kernel/evis/gather_evis.c | 101 +- .../internal/src/kernel/evis/gather_nd_evis.c | 37 +- .../src/kernel/evis/grucell_activation_evis.c | 74 +- .../kernel/evis/grucell_activation_z_h_evis.c | 37 +- .../evis/grucell_h_times_activation_r_evis.c | 15 +- .../grucell_reset_after_activation_evis.c | 36 +- 
.../src/kernel/evis/l2normalizescale_evis.c | 40 +- .../kernel/evis/layer_normalization_evis.c | 64 +- .../src/kernel/evis/log_softmax_evis.c | 610 +- .../kernel/evis/lstmunit_activation_evis.c | 43 +- .../internal/src/kernel/evis/matrixmul_evis.c | 133 +- .../internal/src/kernel/evis/maximum_evis.c | 107 +- .../internal/src/kernel/evis/minimum_evis.c | 107 +- .../vx/internal/src/kernel/evis/mod_evis.c | 62 +- .../internal/src/kernel/evis/moments_evis.c | 76 +- .../internal/src/kernel/evis/one_hot_evis.c | 7 +- .../src/kernel/evis/poolwithargmax_evis.c | 30 +- .../vx/internal/src/kernel/evis/pow_evis.c | 67 +- .../src/kernel/evis/pre_process_bgra_evis.c | 24 +- .../src/kernel/evis/pre_process_gray_evis.c | 54 +- .../kernel/evis/pre_process_nv12_rggb_evis.c | 884 +++ .../evis/pre_process_rgb888_planar_evis.c | 23 +- .../pre_process_rgb888_planar_nhwc_evis.c | 22 +- .../src/kernel/evis/pre_process_rgb_evis.c | 24 +- .../src/kernel/evis/pre_process_yuv444_evis.c | 45 +- .../vx/internal/src/kernel/evis/prelu_evis.c | 40 +- .../src/kernel/evis/reducemax_internal_evis.c | 72 +- .../src/kernel/evis/reducemin_internal_evis.c | 71 +- .../kernel/evis/reduceprod_internal_evis.c | 72 +- .../src/kernel/evis/relu_keras_evis.c | 35 +- .../src/kernel/evis/resize_1d_bilinear_evis.c | 55 +- .../src/kernel/evis/resize_1d_nearest_evis.c | 47 +- .../src/kernel/evis/resize_bilinear_evis.c | 203 +- .../src/kernel/evis/resize_cubic_evis.c | 453 ++ .../src/kernel/evis/resize_nearest_evis.c | 47 +- .../src/kernel/evis/scatter_nd_evis.c | 10 +- .../src/kernel/evis/scatter_nd_update_evis.c | 8 +- .../evis/scatter_nd_update_reduction_evis.c | 861 +++ .../vx/internal/src/kernel/evis/select_evis.c | 62 +- .../src/kernel/evis/sequence_mask_evis.c | 40 +- .../vx/internal/src/kernel/evis/slice_evis.c | 41 +- .../kernel/evis/spatial_transformer_evis.c | 59 +- .../vx/internal/src/kernel/evis/swish_evis.c | 82 +- .../vx/internal/src/kernel/evis/tile_evis.c | 42 +- .../internal/src/kernel/evis/upsample_evis.c | 30 +- .../src/kernel/evis/upsamplescale_evis.c | 34 +- .../kernel/vsi_nn_kernel_gpu_shape_optimize.c | 17 +- .../internal/src/kernel/vsi_nn_kernel_lut.c | 8 + .../src/kernel/vsi_nn_kernel_selector.c | 16 +- .../internal/src/kernel/vsi_nn_kernel_util.c | 65 +- .../internal/src/kernel/vx/eltwise_unary_vx.c | 115 +- .../vx/internal/src/kernel/vx/layer_norm_vx.c | 87 + .../internal/src/kernel/vx/log_softmax_vx.c | 85 + .../ops/cl/crop_and_resize_bilinear.cl | 107 + .../cl/crop_and_resize_nearest_neighbor.cl | 77 + .../src/libnnext/ops/cl/detect_post_box.cl | 101 - .../src/libnnext/ops/cl/eltwise_unary_0.cl | 8 + .../src/libnnext/ops/cl/eltwise_unary_1.cl | 8 + .../libnnext/ops/cl/grucell_activation_z_h.cl | 27 +- .../ops/cl/grucell_reset_after_activation.cl | 11 +- .../ops/cl/log_softmax_exceed_axis0.cl | 167 + .../ops/cl/log_softmax_exceed_axis1.cl | 172 + .../src/libnnext/ops/cl/matrixmul_4x.cl | 128 + .../src/libnnext/ops/cl/resize_cubic.cl | 195 + .../ops/cl/scatter_nd_update_reduction.cl | 203 + .../cl/scatter_nd_update_reduction_conv.cl | 72 + .../vx/internal/src/libnnext/ops/cl/swish.cl | 2 +- .../ops/vx/crop_and_resize_bilinear.vx | 255 + .../vx/crop_and_resize_nearest_neighbor.vx | 292 + .../src/libnnext/ops/vx/eltwise_unary_2d_1.vx | 8 +- .../src/libnnext/ops/vx/eltwise_unary_3d_1.vx | 7 + .../vx/internal/src/libnnext/ops/vx/gather.vx | 24 +- .../src/libnnext/ops/vx/gather_array.vx | 119 +- .../src/libnnext/ops/vx/gather_batch.vx | 24 +- .../src/libnnext/ops/vx/gather_mix.vx | 18 +- 
.../src/libnnext/ops/vx/gather_mix_batch.vx | 18 +- .../libnnext/ops/vx/grucell_activation_z_h.vx | 15 +- .../ops/vx/grucell_reset_after_activation.vx | 12 + .../ops/vx/layer_normalization_axis01_0.vx | 315 + .../ops/vx/layer_normalization_axis01_1.vx | 317 + .../ops/vx/layer_normalization_axis01_2.vx | 348 + .../ops/vx/layer_normalization_axis01_3.vx | 178 + .../ops/vx/layer_normalization_axis01_sum.vx | 228 + .../ops/vx/log_softmax_exceed_axis0.vx | 190 + .../ops/vx/log_softmax_exceed_axis0_BF16.vx | 187 + .../ops/vx/log_softmax_exceed_axis1.vx | 172 + .../ops/vx/log_softmax_exceed_axis1_BF16.vx | 180 + .../ops/vx/pre_process_nv12_rggb_copy.vx | 111 + .../ops/vx/pre_process_nv12_rggb_scale.vx | 247 + .../libnnext/ops/vx/resize_bilinear_F16.vx | 145 +- .../libnnext/ops/vx/resize_bilinear_I16.vx | 72 +- .../src/libnnext/ops/vx/resize_bilinear_I8.vx | 60 +- .../libnnext/ops/vx/resize_bilinear_U16.vx | 278 + .../src/libnnext/ops/vx/resize_bilinear_U8.vx | 8 +- .../src/libnnext/ops/vx/resize_cubic.vx | 270 + .../ops/vx/scatter_nd_update_reduction.vx | 259 + .../vx/scatter_nd_update_reduction_conv.vx | 110 + .../libnnext/ops/vx/vsi_nn_kernel_header.vx | 52 +- .../src/libnnext/vsi_nn_libnnext_resource.c | 5761 ++++++++++++++++- .../vsi_nn_op_axis_aligned_bbox_transform.c | 62 +- .../vsi_nn_op_bidirectional_sequence_lstm.c | 29 + .../vx/internal/src/ops/vsi_nn_op_concat.c | 12 + .../src/ops/vsi_nn_op_conv2d_lstm_cell.c | 49 +- .../vx/internal/src/ops/vsi_nn_op_conv_relu.c | 298 +- .../src/ops/vsi_nn_op_conv_relu_pool.c | 254 +- .../src/ops/vsi_nn_op_crop_and_resize.c | 193 + .../src/ops/vsi_nn_op_depth2space_internal.c | 18 +- .../src/ops/vsi_nn_op_detection_postprocess.c | 141 +- .../vx/internal/src/ops/vsi_nn_op_eltwise.c | 7 +- .../src/ops/vsi_nn_op_eltwise_unary.c | 1 + .../src/ops/vsi_nn_op_fullconnect_relu.c | 313 +- .../vx/internal/src/ops/vsi_nn_op_gather.c | 7 +- .../src/ops/vsi_nn_op_generate_proposals.c | 88 +- .../src/ops/vsi_nn_op_grouped_conv1d.c | 22 +- .../src/ops/vsi_nn_op_grouped_conv2d.c | 10 + .../src/ops/vsi_nn_op_groupnormalize.c | 70 + .../src/ops/vsi_nn_op_heatmap_max_keypoint.c | 71 +- .../internal/src/ops/vsi_nn_op_imageprocess.c | 86 +- .../src/ops/vsi_nn_op_instancenormalize.c | 12 +- .../src/ops/vsi_nn_op_layernormalize.c | 13 + .../internal/src/ops/vsi_nn_op_log_softmax.c | 35 +- .../internal/src/ops/vsi_nn_op_lstm_ovxlib.c | 27 + .../src/ops/vsi_nn_op_lstmunit_ovxlib.c | 21 +- .../vx/internal/src/ops/vsi_nn_op_permute.c | 11 + .../internal/src/ops/vsi_nn_op_pre_process.c | 20 +- .../src/ops/vsi_nn_op_pre_process_nv12.c | 10 +- .../src/ops/vsi_nn_op_quantized_16bit_lstm.c | 65 +- .../vx/internal/src/ops/vsi_nn_op_reshape.c | 7 +- .../vx/internal/src/ops/vsi_nn_op_reshape2.c | 7 +- .../vx/internal/src/ops/vsi_nn_op_resize.c | 4 + .../src/ops/vsi_nn_op_scatter_nd_update.c | 27 +- .../src/ops/vsi_nn_op_strided_slice.c | 7 +- src/tim/vx/internal/src/ops/vsi_nn_op_topk.c | 80 +- .../quantization/vsi_nn_asymmetric_affine.c | 3 + .../src/utils/vsi_nn_code_generator.c | 2 + src/tim/vx/internal/src/utils/vsi_nn_util.c | 27 + .../vx/internal/src/vip/virtual_device.cpp | 4 + src/tim/vx/internal/src/vsi_nn_context.c | 99 +- src/tim/vx/internal/src/vsi_nn_graph.c | 744 ++- src/tim/vx/internal/src/vsi_nn_log.c | 30 +- src/tim/vx/internal/src/vsi_nn_node.c | 24 +- .../internal/src/vsi_nn_node_attr_template.c | 1 + .../vx/internal/src/vsi_nn_pre_post_process.c | 16 +- src/tim/vx/internal/src/vsi_nn_tensor.c | 179 +- .../vx/internal/src/vsi_nn_tensor_util_prv.h | 20 + 
src/tim/vx/internal/src/vsi_nn_types_prv.h | 12 + 203 files changed, 18939 insertions(+), 5096 deletions(-) create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_crop_and_resize.h create mode 100644 src/tim/vx/internal/src/kernel/cl/crop_and_resize_cl.c delete mode 100644 src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c delete mode 100644 src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/resize_cubic_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/scatter_nd_update_reduction_cl.c create mode 100644 src/tim/vx/internal/src/kernel/evis/crop_and_resize_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/pre_process_nv12_rggb_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/resize_cubic_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/scatter_nd_update_reduction_evis.c create mode 100644 src/tim/vx/internal/src/kernel/vx/layer_norm_vx.c create mode 100644 src/tim/vx/internal/src/kernel/vx/log_softmax_vx.c create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/crop_and_resize_bilinear.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/crop_and_resize_nearest_neighbor.cl delete mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/detect_post_box.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_exceed_axis0.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_exceed_axis1.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/resize_cubic.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update_reduction.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update_reduction_conv.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/crop_and_resize_bilinear.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/crop_and_resize_nearest_neighbor.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_0.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_1.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_2.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_3.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_sum.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis0.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis0_BF16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis1.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis1_BF16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_rggb_copy.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_rggb_scale.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/resize_cubic.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_reduction.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_reduction_conv.vx create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_crop_and_resize.c diff --git a/src/tim/vx/internal/.gitignore b/src/tim/vx/internal/.gitignore index 6858186..665a34f 100644 --- a/src/tim/vx/internal/.gitignore +++ b/src/tim/vx/internal/.gitignore @@ -3,6 +3,9 @@ ## ## Get latest from 
https://github.com/github/gitignore/blob/master/VisualStudio.gitignore +# Some header file +include/vsi_nn_feature_config.h + # User-specific files *.suo *.user diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index 0a1424e..6c879e9 100755 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -195,3 +195,5 @@ DEF_OP(GRID_SAMPLE) DEF_OP(LPNORM) DEF_OP(RESIZE_3D) DEF_OP(REDUCEL2) +DEF_OP(CROP_AND_RESIZE) +DEF_OP(TAN) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h index 8b8c055..3143b41 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h @@ -55,6 +55,7 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum VSI_NN_KERNEL_LUT_ATANH = 21, VSI_NN_KERNEL_LUT_ACOSH = 22, VSI_NN_KERNEL_LUT_INVERSE_SIGMOID = 23, + VSI_NN_KERNEL_LUT_TAN = 24, }; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h index 8a4e7cb..22bed7e 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h @@ -106,10 +106,21 @@ enum BI_LSTM_BW_INPUT_LAYERNORM_C = 54, BI_LSTM_BW_INPUT_LAYERNORM_O = 55, + BI_LSTM_FW_INPUT_BIAS_R2I = 56, + BI_LSTM_FW_INPUT_BIAS_R2F = 57, + BI_LSTM_FW_INPUT_BIAS_R2C = 58, + BI_LSTM_FW_INPUT_BIAS_R2O = 59, + + BI_LSTM_BW_INPUT_BIAS_R2I = 60, + BI_LSTM_BW_INPUT_BIAS_R2F = 61, + BI_LSTM_BW_INPUT_BIAS_R2C = 62, + BI_LSTM_BW_INPUT_BIAS_R2O = 63, + BI_LSTM_INPUT_CNT, BI_LSTM_FW_OUTPUT_OUTPUT = 0, BI_LSTM_BW_OUTPUT_OUTPUT = 1, + BI_LSTM_OUTPUT_CNT }; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_crop_and_resize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_crop_and_resize.h new file mode 100644 index 0000000..aa12459 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_crop_and_resize.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CROP_AND_RESIZE_H +#define _VSI_NN_OP_CROP_AND_RESIZE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_crop_and_resize_param +{ + struct _crop_and_resize_local_data_t * lcl_data; + const int32_t* crop_size; + vsi_enum resize_method; + float extrapolation_value; +} vsi_nn_crop_and_resize_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h index 29c8cd1..19c45a1 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h @@ -70,6 +70,11 @@ enum LSTM_INPUT_AUX_WEIGHT_I2C = 27, LSTM_INPUT_AUX_WEIGHT_I2O = 28, + LSTM_INPUT_BIAS_R2I = 29, + LSTM_INPUT_BIAS_R2F = 30, + LSTM_INPUT_BIAS_R2C = 31, + LSTM_INPUT_BIAS_R2O = 32, + LSTM_INPUT_CNT, LSTM_OUTPUT_OUTPUT = 0, diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h index cc53d4c..bc23d65 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h @@ -74,6 +74,11 @@ enum LSTMUNIT_INPUT_AUX_WEIGHT_I2C = 27, LSTMUNIT_INPUT_AUX_WEIGHT_I2O = 28, + LSTMUNIT_INPUT_BIAS_R2I = 29, + LSTMUNIT_INPUT_BIAS_R2F = 30, + LSTMUNIT_INPUT_BIAS_R2C = 31, + LSTMUNIT_INPUT_BIAS_R2O = 32, + LSTMUNIT_INPUT_CNT, LSTMUNIT_OUTPUT_OUTPUT = 0, diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h index aaa72c6..b8d19d5 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h @@ -38,7 +38,8 @@ typedef uint32_t vsi_nn_interpolation_type_t; enum { VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR = 0, VSI_NN_INTERPOLATION_BILINEAR, - VSI_NN_INTERPOLATION_AREA + VSI_NN_INTERPOLATION_AREA, + VSI_NN_INTERPOLATION_CUBIC }; typedef uint32_t vsi_nn_resize_layout_type_t; enum diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_nd_update.h b/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_nd_update.h index 68e1b29..7121b3b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_nd_update.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_nd_update.h @@ -33,6 +33,7 @@ extern "C" { typedef struct _vsi_nn_scatter_nd_update_param { vsi_bool use_locking; + vsi_nn_reduction_type_e reduction; } vsi_nn_scatter_nd_update_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h index 128e7d0..007983c 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -471,6 +471,12 @@ char* vsi_nn_getenv const char * var_name ); +int32_t vsi_nn_getenv_asint + ( + const char* env, + int32_t default_value + ); + FILE* vsi_nn_fopen ( const char * file_name, diff --git a/src/tim/vx/internal/include/vip/virtual_device.h b/src/tim/vx/internal/include/vip/virtual_device.h index a91ef83..4d138c6 100644 --- a/src/tim/vx/internal/include/vip/virtual_device.h +++ b/src/tim/vx/internal/include/vip/virtual_device.h @@ -43,6 +43,7 @@ class IDevice { OVXLIB_API IDevice(uint32_t id); OVXLIB_API ~IDevice(); OVXLIB_API uint32_t Id() const; + OVXLIB_API bool GraphSubmit(vsi_nn_graph_t* graph, bool (*func)(const void*), data_t data); 
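For illustration only, a minimal usage sketch of two of the additions above: the new reduction field of vsi_nn_scatter_nd_update_param and the vsi_nn_getenv_asint helper. The environment-variable name, the existing node pointer, and the nn_param member name are assumptions for this sketch, not part of the change:

    /* Read an integer tuning knob from the environment, falling back to 1 when unset. */
    int32_t slice_opt = vsi_nn_getenv_asint("VSI_NN_ENABLE_SLICE_OPTIMIZE", 1);

    /* Accumulate overlapping updates instead of overwriting them on an existing
       scatter_nd_update node (VSI_NN_REDUCTION_TYPE_MAX/MIN are also available now). */
    node->nn_param.scatter_nd_update.reduction = VSI_NN_REDUCTION_TYPE_ADD;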
OVXLIB_API bool GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data); OVXLIB_API bool GraphRemove(const vsi_nn_graph_t* graph); OVXLIB_API bool ThreadExit(); diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index 777cf5c..4ac9f61 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -79,6 +79,8 @@ typedef struct _vsi_nn_runtime_option_t int32_t enable_dataconvert_optimize; int32_t enable_stream_processor; int32_t enable_rgb88_planar_nhwc; + int32_t enable_slice_optimize; + int32_t enable_batch_opt; } vsi_nn_runtime_option_t; /** diff --git a/src/tim/vx/internal/include/vsi_nn_feature_config.h b/src/tim/vx/internal/include/vsi_nn_feature_config.h index e93d1af..7918ae3 100644 --- a/src/tim/vx/internal/include/vsi_nn_feature_config.h +++ b/src/tim/vx/internal/include/vsi_nn_feature_config.h @@ -1,3 +1,26 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the Software), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ /*****Auto generated header file, Please DO NOT modify manually!*****/ #ifndef _VSI_NN_FEATURE_CONFIG_H #define _VSI_NN_FEATURE_CONFIG_H @@ -20,5 +43,15 @@ #define VSI_CONCAT_ENHANCE_SUPPORT #endif #define VSI_CREATE_TENSOR_FROM_VIEW_SUPPORT +#ifndef VSI_SWAP_HANDLE_CACHE_SUPPORT +#define VSI_SWAP_HANDLE_CACHE_SUPPORT +#endif +#define VSI_EXPORT_APIS_FOR_SETUP_GRAPH 1 +#if defined(VX_SET_TENSOR_MEMPOOL_TYPE_SUPPORT) && VX_SET_TENSOR_MEMPOOL_TYPE_SUPPORT +#define VSI_CREATE_TENSOR_FROM_AXISRAM_SUPPORT +#endif +#if defined(VX_13_NN_COMPATIBLITY) +#define VSI_MAP_TENSOR_PATCH_SUPPORT +#endif #endif diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h index 4053988..89786c4 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph.h +++ b/src/tim/vx/internal/include/vsi_nn_graph.h @@ -382,6 +382,31 @@ OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromView vsi_size_t* end ); +/** + * Add a new tensor from AXI-SRAM + * Create a new tensor from internal AXI-SRAM and add it to graph. + * It just creates the tensor object and does not actually allocate the memory + * in AXI-SRAM until the verify graph stage. In other words, the tensor object is + * created beforehand, but the memory for storing its data is not allocated until the verify + * graph stage.
AXI-SRAM is an internal memory resource whose allocation is done + * strategically to optimize performance and resource usage in graph verification. + * If there is not enough memory in AXI-SRAM, vsi_nn_VerifyGraph will return VSI_FAILURE. + * The user can't access the tensor memory (read/write tensor data) before the graph has been verified, + * since the tensor memory is not allocated. + * @param[in] graph Graph handle + * @param[in] id Optional id to the tensor, set it to VSI_NN_TENSOR_ID_AUTO, + * and a new id will be generated. + * @param[in] attr Tensor attributes to the new tensor. + * + * @return The new tensor id on success, or VSI_NN_TENSOR_ID_NA otherwise. + */ +OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromAXISRAM + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t id, + vsi_nn_tensor_attr_t * attr + ); + /** * Attach tensor to graph * Attach an exist tensor to graph. @@ -796,6 +821,18 @@ OVXLIB_API vsi_status vsi_nn_SetGraphTransformOption size_t size ); +/** + * Graph shape inference + * + * @param[in] graph Graph handle + * + * @return VSI_SUCCESS on success, or appropriate error code otherwise + * */ +OVXLIB_API vsi_status vsi_nn_InferShape +( + vsi_nn_graph_t* graph +); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_node.h b/src/tim/vx/internal/include/vsi_nn_node.h index 0a69dbd..3756c7c 100644 --- a/src/tim/vx/internal/include/vsi_nn_node.h +++ b/src/tim/vx/internal/include/vsi_nn_node.h @@ -155,6 +155,22 @@ OVXLIB_API void vsi_nn_PrintNode vsi_nn_node_id_t id ); +#if VX_GRAPH_BATCH_OPT_SUPPORT +/** + * Set how many parts this node is split into along the batch dimension. + * + * @param[in] node Node. + * @param[in] split_num Number of parts to split the batch dimension into. + * + * @return VSI_SUCCESS on success, or error code otherwise. + */ +OVXLIB_API vsi_status vsi_nn_SetNodeBatchSplitNum +( + vsi_nn_node_t* node, + int8_t split_num +); +#endif + /** * Update node attribute * Update openvx node attribute based on ovxlib's node attribute diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index f961835..173be94 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -209,6 +209,7 @@ #include "ops/vsi_nn_op_lpnorm.h" #include "ops/vsi_nn_op_resize_3d.h" #include "ops/vsi_nn_op_reducel2.h" +#include "ops/vsi_nn_op_crop_and_resize.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" #include "ops/vsi_nn_op_inverse_sigmoid.h" @@ -406,6 +407,7 @@ typedef union _vsi_nn_nn_param vsi_nn_lpnorm_param lpnorm; vsi_nn_resize_3d_param resize_3d; vsi_nn_reducel2_param reducel2; + vsi_nn_crop_and_resize_param crop_and_resize; void* client_param; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_platform.h b/src/tim/vx/internal/include/vsi_nn_platform.h index f5548c8..077c148 100644 --- a/src/tim/vx/internal/include/vsi_nn_platform.h +++ b/src/tim/vx/internal/include/vsi_nn_platform.h @@ -35,6 +35,9 @@ #if defined(VX_KHR_COMPATIBILITY) && (0x1==VX_KHR_COMPATIBILITY) #include #endif +#ifdef VSI_CREATE_TENSOR_FROM_AXISRAM_SUPPORT +#include +#endif /* This is a compatibility head file for backward compatibility OpenVX 1.1 spec diff --git a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h index 9cfae60..6832d86 100644 --- a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h +++ b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h @@ -89,6 +89,8 @@ typedef enum
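As a usage illustration of the new graph-level APIs documented above, a minimal sketch; the attribute values are placeholders, graph is assumed to be an existing vsi_nn_graph_t*, and error handling is omitted:

    vsi_nn_tensor_attr_t attr;
    memset(&attr, 0, sizeof(attr));
    attr.dim_num = 2;
    attr.size[0] = 128;   /* placeholder shape */
    attr.size[1] = 64;
    attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16;

    /* The tensor object is created now; its AXI-SRAM backing is only allocated when
       vsi_nn_VerifyGraph() runs, and verification fails if AXI-SRAM is too small. */
    vsi_nn_tensor_id_t tid = vsi_nn_AddTensorFromAXISRAM(graph, VSI_NN_TENSOR_ID_AUTO, &attr);

    /* Run shape inference over the whole graph. */
    vsi_status status = vsi_nn_InferShape(graph);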
VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422, VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422, VSI_NN_SOURCE_FORMAT_IMAGE_NV21, + VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB, + VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR, } vsi_nn_preprocess_source_format_e; /** diff --git a/src/tim/vx/internal/include/vsi_nn_pub.h b/src/tim/vx/internal/include/vsi_nn_pub.h index 48525a4..1b48062 100644 --- a/src/tim/vx/internal/include/vsi_nn_pub.h +++ b/src/tim/vx/internal/include/vsi_nn_pub.h @@ -54,5 +54,10 @@ #include "utils/vsi_nn_dtype_util.h" #include "quantization/vsi_nn_asymmetric_affine.h" #include "quantization/vsi_nn_dynamic_fixed_point.h" + +#if defined(VSI_ENABLE_LCOV_TEST) && VSI_ENABLE_LCOV_TEST +#include "lcov/vsi_nn_coverage.h" +#endif + #endif diff --git a/src/tim/vx/internal/include/vsi_nn_tensor_util.h b/src/tim/vx/internal/include/vsi_nn_tensor_util.h index 3441489..4c88f95 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor_util.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor_util.h @@ -817,6 +817,82 @@ vsi_nn_tensor_t * vsi_nn_dropout_tensor float rate ); +/** + * Allows the application to get direct access to a patch of a tensor object. + * A wrapper API for OpenVX vxMapTensorPatch. + * + * @param[in] graph Graph handle. + * @param[in] tensor Tensor handle. + * @param[out] ptr The address of a pointer that the function sets to the + * address where the requested data can be accessed. The returned (*ptr) address + * is only valid between the call to the function and the corresponding call to + * vsi_nn_UnmapTensorPatch. + * @param [in] usage This declares the access mode for the tensor patch, using + * the vsi_nn_accessor_type_e enumeration. + * VSI_NN_READ_ONLY: after the function call, the content of the memory location + * pointed by (*ptr) contains the tensor patch data. Writing into this memory location + * is forbidden and its behavior is undefined. + * VSI_NN_READ_AND_WRITE : after the function call, the content of the memory + * location pointed by (*ptr) contains the tensor patch data; writing into this memory + * is allowed only for the location of items and will result in a modification of the + * affected items in the tensor object once the range is unmapped. Writing into + * a gap between items (when (*stride) > item size in bytes) is forbidden and its + * behavior is undefined. + * VSI_NN_WRITE_ONLY: after the function call, the memory location pointed by (*ptr) + * contains undefined data; writing each item of the range is required prior to + * unmapping. Items not written by the application before unmap will become + * undefined after unmap, even if they were well defined before map. Like for + * VSI_NN_READ_AND_WRITE, writing into a gap between items is forbidden and its behavior + * is undefined. + * @return VSI_SUCCESS on success, or error code otherwise. + */ + +OVXLIB_API vsi_status vsi_nn_MapTensorPatch + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor, + void** ptr, + vsi_nn_accessor_type_e usage + ); + +/** + * Unmap and commit potential changes to a tensor object patch that was previously mapped. + * Unmapping a tensor patch invalidates the memory location from which the patch could + * be accessed by the application. Accessing this memory location after the unmap function + * completes has an undefined behavior. + * @param[in] graph Graph handle. + * @param [in] tensor The reference to the tensor object to unmap. + * return VSI_SUCCESS on success, or error code otherwise.
+ */ + +OVXLIB_API vsi_status vsi_nn_UnmapTensorPatch + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor + ); + +/** + * Create a new tensor from internal AXI-SRAM (kernel driver mapped). + * It just creates the tensor object and does not actually allocate the memory + * in AXI-SRAM until the verify graph stage. In other words, the tensor + * object is created beforehand, but the memory for storing its data is not + * allocated until the verify graph stage. AXI-SRAM is an internal memory resource + * whose allocation is done strategically to optimize performance and + * resource usage in graph verification. + * If there is not enough memory in AXI-SRAM, vsi_nn_VerifyGraph will return VSI_FAILURE. + * The user can't access the tensor memory (read/write tensor data) before the graph has been verified, + * since the tensor memory is not allocated. + * @param[in] graph Graph handle + * @param[in] attr Tensor attributes to the new tensor. + * + * @return Tensor handle on success, or NULL otherwise. + */ +OVXLIB_API vsi_nn_tensor_t * vsi_nn_CreateTensorFromAXISRAM + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_attr_t * attr + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_types.h b/src/tim/vx/internal/include/vsi_nn_types.h index 380057b..4e0b58b 100644 --- a/src/tim/vx/internal/include/vsi_nn_types.h +++ b/src/tim/vx/internal/include/vsi_nn_types.h @@ -115,7 +115,9 @@ typedef enum { VSI_NN_REDUCTION_TYPE_NONE, VSI_NN_REDUCTION_TYPE_ADD, - VSI_NN_REDUCTION_TYPE_MUL + VSI_NN_REDUCTION_TYPE_MUL, + VSI_NN_REDUCTION_TYPE_MAX, + VSI_NN_REDUCTION_TYPE_MIN } vsi_nn_reduction_type_e; /** Pad mode enum */ @@ -269,7 +271,9 @@ typedef enum _vsi_nn_yuv_type typedef enum _vsi_nn_nv_type { VSI_NN_YUV_TYPE_NV12, - VSI_NN_YUV_TYPE_NV21 + VSI_NN_YUV_TYPE_NV21, + VSI_NN_YUV_TYPE_NV12_RGGB, + VSI_NN_YUV_TYPE_NV21_BGGR }vsi_nn_nv_type; typedef enum _vsi_nn_roi_align_type_e @@ -283,6 +287,12 @@ typedef enum _vsi_nn_custom_warp_affine_type_e { VSI_NN_WARP_AFFINE_TYPE_RGB } vsi_nn_custom_warp_affine_type_e; +typedef enum _vsi_nn_accessor_type_e { + VSI_NN_READ_ONLY = VX_READ_ONLY, + VSI_NN_WRITE_ONLY = VX_WRITE_ONLY, + VSI_NN_READ_AND_WRITE = VX_READ_AND_WRITE +} vsi_nn_accessor_type_e; + /** Deprecated */ typedef uint32_t vsi_nn_size_t; diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 97fd959..2b7e1bd 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -32,8 +32,8 @@ extern "C"{ #endif #define VSI_NN_VERSION_MAJOR 1 -#define VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 88 +#define VSI_NN_VERSION_MINOR 2 +#define VSI_NN_VERSION_PATCH 2 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/Android.mk b/src/tim/vx/internal/src/Android.mk index a1b3683..6c425f6 100644 --- a/src/tim/vx/internal/src/Android.mk +++ b/src/tim/vx/internal/src/Android.mk @@ -14,6 +14,10 @@ ifeq ($(PLATFORM_VENDOR),1) LOCAL_VENDOR_MODULE := true endif +$(info Remove $(LOCAL_PATH)/../include/vsi_nn_feature_config.h ...)
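A minimal read-modify-write sketch of the map/unmap pair declared above; graph and tensor are assumed to already exist and to hold float32 data (placeholders, not part of this change):

    float* data = NULL;
    if (vsi_nn_MapTensorPatch(graph, tensor, (void**)&data, VSI_NN_READ_AND_WRITE) == VSI_SUCCESS)
    {
        /* The mapped pointer is only valid until the matching unmap call. */
        data[0] = 1.0f;
        vsi_nn_UnmapTensorPatch(graph, tensor);
    }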
+$(shell rm $(LOCAL_PATH)/../include/vsi_nn_feature_config.h -rf) +$(info $(shell bash $(LOCAL_PATH)/../gcc_gen_feature_config_header.sh $(LOCAL_PATH)/..)) + LOCAL_SRC_FILES := \ vsi_nn_context.c \ vsi_nn_client_op.c \ @@ -59,12 +63,6 @@ LOCAL_SRC_FILES += \ post/vsi_nn_post_fasterrcnn.c \ post/vsi_nn_post_cmupose.c -LOCAL_SRC_FILES += \ - cpu_backend/vsi_nn_cpu_backend.c \ - cpu_backend/vsi_nn_cpu_backend_conv2d.c \ - cpu_backend/vsi_nn_cpu_backend_deconv2d.c \ - cpu_backend/npuref_interface.c - LOCAL_SRC_FILES += libnnext/vsi_nn_libnnext_resource.c \ libnnext/vsi_nn_vxkernel.c @@ -78,11 +76,10 @@ LOCAL_SRC_FILES += kernel/vsi_nn_kernel.c \ kernel/vsi_nn_kernel_param.c \ kernel/vsi_nn_kernel_gpu_shape_optimize.c \ kernel/vsi_nn_kernel_lut.c \ - kernel/vsi_nn_spinst.c \ - kernel/vsi_nn_sp_unit_operation.c \ - kernel/vsi_nn_sp_lut.c \ kernel/vsi_nn_gpu.c +LOCAL_SRC_FILES += vip/virtual_device.cpp + LIBNNEXT_KERNEL_SOURCES := $(wildcard $(LOCAL_PATH)/libnnext/ops/kernel/*.c) LOCAL_SRC_FILES += $(LIBNNEXT_KERNEL_SOURCES:$(LOCAL_PATH)/%=%) @@ -117,13 +114,14 @@ LOCAL_C_INCLUDES += \ $(AQROOT)/sdk/inc/ \ $(AQROOT)/sdk/inc/HAL \ $(LOCAL_PATH)/../include \ + $(LOCAL_PATH)/../include/vip \ $(LOCAL_PATH)/../include/ops \ $(LOCAL_PATH)/../include/utils \ $(LOCAL_PATH)/../include/infernce \ $(LOCAL_PATH)/../include/client \ - $(LOCAL_PATH)/../include/cpu_backend \ $(LOCAL_PATH)/../include/libnnext \ - $(LOCAL_PATH)/../src + $(LOCAL_PATH)/../src \ + $(LOCAL_PATH)/../src/vip LOCAL_CFLAGS := \ -DLINUX \ diff --git a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c index 4b1369f..7ad273d 100644 --- a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) #include #include #include @@ -293,6 +294,16 @@ static vsi_status _query_kernel input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && input0_dtype == I16) + { + input0_dtype = I32; + } + + if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && input1_dtype == I16) + { + input1_dtype = I32; + } + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && output_dtype == I8) { output_dtype = BOOL8; @@ -452,3 +463,4 @@ final: REGISTER_BACKEND_CL( relational_ops, _setup ) __END_DECLS +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/crop_and_resize_cl.c b/src/tim/vx/internal/src/kernel/cl/crop_and_resize_cl.c new file mode 100644 index 0000000..4adcbce --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/crop_and_resize_cl.c @@ -0,0 +1,359 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +typedef enum _crop_and_resize_type_e +{ + nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR, + bilinear = VSI_NN_INTERPOLATION_BILINEAR, +}crop_and_resize_type_e; + +#define _CROP_AND_RESIZE_KERNEL_SOURCE_NAME "crop_and_resize_" + +// Add kernel hashtable here +#define CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \ + (( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8) | (RESIZE_METHOD)) +#define CROP_AND_RESIZE_KERNEL( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \ + { CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ), \ + CVIVANTE_NAMESPACE("cl.crop_and_resize_"#RESIZE_METHOD"_"#IN_DTYPE"to"#OUT_DTYPE), \ + _CROP_AND_RESIZE_KERNEL_SOURCE_NAME#RESIZE_METHOD } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _crop_and_resize_kernel_map[] = +{ + // Register kernel here + CROP_AND_RESIZE_KERNEL( U32, U32, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( U32, F32, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( F32, F32, nearest_neighbor), + CROP_AND_RESIZE_KERNEL( F32, U32, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( F32, I32, nearest_neighbor), + CROP_AND_RESIZE_KERNEL( I32, I32, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( I32, F32, nearest_neighbor), + + CROP_AND_RESIZE_KERNEL( U32, U32, bilinear), + CROP_AND_RESIZE_KERNEL( U32, F32, bilinear), + CROP_AND_RESIZE_KERNEL( F32, F32, bilinear), + CROP_AND_RESIZE_KERNEL( F32, U32, bilinear), + CROP_AND_RESIZE_KERNEL( F32, I32, bilinear), + CROP_AND_RESIZE_KERNEL( I32, I32, bilinear), + CROP_AND_RESIZE_KERNEL( I32, F32, bilinear), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _crop_and_resize_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CROP_AND_RESIZE_PARAM_NUM _cnt_of_array( _crop_and_resize_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_crop_and_resize_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 
0, 0}, + {0, 0, 0}, + {0, 0, 0}, + }; + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + int32_t crop_width = 0; + int32_t crop_height = 0; + int32_t image_width = 0; + int32_t image_height = 0; + int32_t batch_out = 0; + float width_scale = 0; + float height_scale = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &batch_out); + CHECK_STATUS_FAIL_GOTO(status, final ); + + image_width = (int32_t)(attr[0]->shape->data[0]); + image_height = (int32_t)(attr[0]->shape->data[1]); + crop_width = (int32_t)(attr[1]->shape->data[0]); + crop_height = (int32_t)(attr[1]->shape->data[1]); + + width_scale = (crop_width > 1) ? (float)(image_width - 1) / (crop_width -1) : 0; + height_scale = (crop_height > 1) ? (float)(image_height - 1) / (crop_height -1) : 0; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = (crop_width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]; + gpu_param.global_size[1] = (crop_height + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = (batch_out + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + + status = vsi_nn_kernel_gpu_add_param( node, "width_scale", &width_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, "height_scale", &height_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, "image_width", &image_width ); + status |= vsi_nn_kernel_gpu_add_param( node, "image_height", &image_height ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _crop_and_resize_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t resize_method + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _crop_and_resize_kernel_map; + size_t kernel_map_size = _cnt_of_array( _crop_and_resize_kernel_map ); + vx_param_description_t * param_def = _crop_and_resize_kernel_param_def; + vx_kernel_initialize_f initializer = _crop_and_resize_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + else if (U8 == in_dtype) + { + in_dtype = U32; + } + else if (I8 == in_dtype || I16 == in_dtype) + { + in_dtype = I32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + else if (U8 == out_dtype) + { + out_dtype = U32; + } + else if (I8 == out_dtype || I16 == out_dtype) + { + out_dtype = I32; + } + + key = CROP_AND_RESIZE_HASH_KEY( in_dtype, out_dtype, resize_method ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( 
kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _crop_and_resize_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CROP_AND_RESIZE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + uint32_t ori_depth = (uint32_t)inputs[0]->attr.size[2]; + uint32_t ori_batchout = (uint32_t)outputs[0]->attr.size[3]; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = vsi_nn_get_tensor_scale(outputs[0]); + float inOutScale = input_scale / output_scale; + float inOutTile = output_zp - inOutScale * input_zp; + + float extrapolation_value = vsi_nn_kernel_param_get_float32( params, "extrapolation_value" ); + int32_t resize_method = vsi_nn_kernel_param_get_int32( params, "resize_method" ); + + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + shapes[0][0] = inputs[0]->attr.size[0]; + shapes[0][1] = inputs[0]->attr.size[1]; + shapes[0][2] = inputs[0]->attr.size[2] * inputs[0]->attr.size[3]; + + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[1][1] = outputs[0]->attr.size[1]; + shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3]; + + rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], 3 ); + rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], 3 ); + + if (rs_input == NULL || rs_output == NULL) + { + goto final; + } + + status = _query_kernel( kernel, inputs, outputs, resize_method ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + node_params[0] = rs_input; + node_params[1] = (vsi_nn_kernel_node_param_t)(inputs[1]->t); + node_params[2] = (vsi_nn_kernel_node_param_t)(inputs[2]->t); + node_params[3] = rs_output; + node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &ori_depth ); + node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &ori_batchout ); + node_params[6] = vsi_nn_kernel_scalar_create( graph, F32, &inOutScale ); + node_params[7] = vsi_nn_kernel_scalar_create( graph, F32, &inOutTile ); + node_params[8] = vsi_nn_kernel_scalar_create( graph, F32, &extrapolation_value ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _CROP_AND_RESIZE_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[4] ); + 
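    /*
     * A worked form of the requantization these scalars implement (illustration only,
     * not additional code in the patch): with input scale s_in, input zero point z_in,
     * output scale s_out and output zero point z_out, the real-value equality
     *     s_in * (q_in - z_in) = s_out * (q_out - z_out)
     * gives
     *     q_out = q_in * (s_in / s_out) + (z_out - (s_in / s_out) * z_in)
     *           = q_in * inOutScale + inOutTile
     * which matches how inOutScale and inOutTile are computed in _setup() above.
     */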
vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + } + } +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( crop_and_resize, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c index 94e79fe..d24dbde 100644 --- a/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c @@ -22,7 +22,7 @@ * *****************************************************************************/ - +#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT) #include #include #include @@ -228,4 +228,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( depth2space_internal, _setup ) - +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c deleted file mode 100644 index 596aab5..0000000 --- a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c +++ /dev/null @@ -1,300 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -typedef enum -{ - INTERNAL_KERNEL_DETECT_POST_BOX, -} _internal_kernel_e; - -#define _DETECT_POST_BOX_KERNEL_SOURCE "detect_post_box" - -#define STR(a) #a -// Add kernel hashtable here -#define DETECT_POST_BOX_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ - ((IN0_DTYPE << 18) | ( IN1_DTYPE << 11 ) | ( OUT_DTYPE << 4)) - -#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ - { DETECT_POST_BOX_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \ - CVIVANTE_NAMESPACE("cl.detect_post_box_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ - _DETECT_POST_BOX_KERNEL_SOURCE} - -typedef struct -{ - uint32_t key; - char * function_name; - const char * source_name; -} _kernel_map_type; - -static const _kernel_map_type _detect_post_box_kernel_map[] = -{ - // Register kernel here - PACK_KERNEL_MAP( F32, F32, F32 ), - PACK_KERNEL_MAP( U8, U8, F32 ), -}; - - -/* - * Kernel params - */ -static vx_param_description_t _detect_post_box_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _DETECT_POST_BOX_PARAM_NUM _cnt_of_array( _detect_post_box_kernel_param_def ) - -#define _DETECT_POST_BOX_F32_PARAM_NUM 8 - -#define SCALAR_SCALE_Y (3) -#define SCALAR_SCALE_X (4) -#define SCALAR_SCALE_H (5) -#define SCALAR_SCALE_W (6) -#define SCALAR_LOG_E (7) -#define SCALAR_TAIL0 (8) -#define SCALAR_TAIL1 (9) -#define SCALAR_SCALE0 (10) -#define SCALAR_SCALE1 (11) - -/* - * Kernel initializer - */ -DEF_KERNEL_INITIALIZER(_detect_post_box_initializer) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - gpu_param_t gpu_param = { - 3, - {0, 0, 0}, - {0, 0, 0}, - {0, 0, 0}, - {0, 0, 0} - }; - vsi_nn_kernel_tensor_attr_t * input_attr = NULL; - vsi_size_array_t * in_shape = NULL; - - VSI_UNREFERENCED(param_size); - VSI_UNREFERENCED(node); - - input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); - CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); - in_shape = input_attr->shape; - - gpu_param.global_scale[0] = 1; - gpu_param.global_scale[1] = 1; - gpu_param.global_scale[2] = 1; - - gpu_param.dim = 2; - gpu_param.global_size[0] = ( - (in_shape->data[1] + gpu_param.global_scale[0] - 1) - / gpu_param.global_scale[0]); - gpu_param.global_size[1] = ( - (in_shape->data[2] + gpu_param.global_scale[1] - 1) - / gpu_param.global_scale[1]); - gpu_param.global_size[2] = 1; - status = vsi_nn_kernel_gpu_config( node, &gpu_param ); - -final: -#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } - SAFE_FREE_TENSOR_ATTR(input_attr); - - return status; -} /* _detect_post_box_initializer() */ - - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const 
outputs, - vsi_bool *is_use_u8_kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_dtype_e in0_dtype; - vsi_nn_kernel_dtype_e in1_dtype; - vsi_nn_kernel_dtype_e out_dtype; - const _kernel_map_type * kernel_map = _detect_post_box_kernel_map; - size_t kernel_map_size = _cnt_of_array( _detect_post_box_kernel_map ); - vx_param_description_t * param_def = _detect_post_box_kernel_param_def; - size_t param_def_size = _cnt_of_array( _detect_post_box_kernel_param_def ); - vx_kernel_initialize_f initializer = _detect_post_box_initializer; - uint32_t key; - uint32_t i; - - in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); - in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); - out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - - - if ((U8 == in0_dtype) && (U8 == in1_dtype)) - { - *is_use_u8_kernel = TRUE; - param_def_size = _DETECT_POST_BOX_PARAM_NUM; - } - else - { - *is_use_u8_kernel = FALSE; - param_def_size = _DETECT_POST_BOX_F32_PARAM_NUM; - } - - key = DETECT_POST_BOX_HASH_KEY( in0_dtype, in1_dtype, out_dtype ); - - for ( i = 0; i < kernel_map_size; i ++ ) - { - if ( kernel_map[i].key == key ) - { - break; - } - } - if ( i < kernel_map_size ) - { - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); - kernel->info.parameters = param_def; - kernel->info.numParams = (vx_uint32)param_def_size; - kernel->info.initialize = initializer; - // Register code source - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, - kernel_map[i].source_name ); - // Register binary source - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - kernel_map[i].source_name ); - status = VSI_SUCCESS; - } - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_DETECT_POST_BOX_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - float logE = (float)(log10(exp(1.0f)) / log10(2.0f)); - float inv_scale_y = vsi_nn_kernel_param_get_float32( params, "inv_scale_y" ); - float inv_scale_x = vsi_nn_kernel_param_get_float32( params, "inv_scale_x" ); - float inv_scale_h = vsi_nn_kernel_param_get_float32( params, "inv_scale_h" ); - float inv_scale_w = vsi_nn_kernel_param_get_float32( params, "inv_scale_w" ); - vsi_bool is_use_u8_kernel = FALSE; - float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); - float input0Zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); - float input0Tail = -input0Zp * input0Scale; - float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); - float input1Zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]); - float input1Tail = -input1Zp * input1Scale; - - status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel ); - - if ( VSI_SUCCESS == status ) - { - size_t node_params_num = _DETECT_POST_BOX_F32_PARAM_NUM; - - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _DETECT_POST_BOX_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_SCALE_Y] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_y ); - node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_x ); - node_params[SCALAR_SCALE_H] = 
vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_h ); - node_params[SCALAR_SCALE_W] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_w ); - node_params[SCALAR_LOG_E] = vsi_nn_kernel_scalar_create( graph, F32, &logE ); - if (is_use_u8_kernel) - { - node_params[SCALAR_TAIL0] = vsi_nn_kernel_scalar_create( graph, F32, &input0Tail ); - node_params[SCALAR_TAIL1] = vsi_nn_kernel_scalar_create( graph, F32, &input1Tail ); - node_params[SCALAR_SCALE0] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale ); - node_params[SCALAR_SCALE1] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale ); - node_params_num = _DETECT_POST_BOX_PARAM_NUM; - } - - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_Y] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_H] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_W] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_LOG_E] ); - if (is_use_u8_kernel) - { - vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL0] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL1] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE0] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE1] ); - } - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CL( detect_post_box, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c deleted file mode 100644 index c278d06..0000000 --- a/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c +++ /dev/null @@ -1,197 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS -#if 0 -/* - * Define kernel meta. 
- */ -typedef enum -{ - INTERNAL_KERNEL_DETECT_POST_NMS, -} _internal_kernel_e; - -#define _DETECT_POST_NMS_KERNEL_SOURCE "detect_post_nms" -#define _DETECT_POST_NMS_KERNEL_NAME CVIVANTE_NAMESPACE("cl.detect_post_nms") - -// Add kernel hashtable here -#define DETECT_POST_NMS_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ - (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) -#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, SOURCE ) \ - { DETECT_POST_NMS_HASH_KEY( IN_DTYPE, OUT_DTYPE ), _DETECT_POST_NMS_KERNEL_NAME, SOURCE } - -typedef struct -{ - uint32_t key; - char * function_name; - const char * source_name; -} _kernel_map_type; - -static const _kernel_map_type _detect_post_nms_kernel_map[] = -{ - // Register kernel here - PACK_KERNEL_MAP( F32, F32, _DETECT_POST_NMS_KERNEL_SOURCE ), -}; - - -/* - * Kernel params - */ -static vx_param_description_t _detect_post_nms_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _DETECT_POST_NMS_PARAM_NUM _cnt_of_array( _detect_post_nms_kernel_param_def ) - -#define SCALAR_NMS_TYPE (6) -#define SCALAR_MAX_NUM (7) -#define SCALAR_MAX_CLASS (8) -#define SCALAR_MAX_DETECT (9) -#define SCALAR_SCORE_TH (10) -#define SCALAR_IOU_TH (11) -#define SCALAR_IS_BG (12) - -/* - * Kernel initializer - */ -DEF_KERNEL_INITIALIZER(_detect_post_nms_initializer) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - - return status; -} /* _detect_post_nms_initializer() */ - - - -/* - * Query kernel - */ - -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_dtype_e in_dtype; - vsi_nn_kernel_dtype_e out_dtype; - const _kernel_map_type * kernel_map = _detect_post_nms_kernel_map; - size_t kernel_map_size = _cnt_of_array( _detect_post_nms_kernel_map ); - vx_param_description_t * param_def = _detect_post_nms_kernel_param_def; - size_t param_def_size = _cnt_of_array( _detect_post_nms_kernel_param_def ); - vx_kernel_initialize_f initializer = _detect_post_nms_initializer; - - uint32_t key; - uint32_t i; - - in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); - out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - - key = DETECT_POST_NMS_HASH_KEY( in_dtype, out_dtype ); - - for ( i = 0; i < kernel_map_size; i++ ) - { - if ( kernel_map[i].key == key ) - { - break; - } - } - if ( i < kernel_map_size ) - { - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); - kernel->info.parameters = param_def; - kernel->info.numParams = param_def_size; - kernel->info.initialize = initializer; - // Register code source - vsi_nn_kernel_add_source( kernel, 
VSI_NN_GPU_SOURCE_FMT_CODE, 1, - kernel_map[i].source_name ); - // Register binary source - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - kernel_map[i].source_name ); - status = VSI_SUCCESS; - } - return status; -} /* _query_kernel() */ -#endif - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_nn_kernel_node_t node = NULL; - - VSI_UNREFERENCED(graph); - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(input_num); - VSI_UNREFERENCED(outputs); - VSI_UNREFERENCED(output_num); - VSI_UNREFERENCED(params); - VSI_UNREFERENCED(kernel); - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CL( detect_post_nms, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c index c44010a..c34a1e4 100644 --- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -60,6 +60,7 @@ typedef enum UNARY_ATANH, UNARY_ACOSH, UNARY_INVERSE_SIGMOID, + UNARY_TAN, } unary_type_e; /* @@ -108,6 +109,7 @@ typedef enum #define ATANH_OPERATION atanh #define ACOSH_OPERATION acosh #define INVERSE_SIGMOID_OPERATION inverse_sigmoid +#define TAN_OPERATION tan #define ADD_UNARY_SH_KERNELS(name) \ TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F32, F32) \ @@ -142,6 +144,7 @@ static const struct { ADD_UNARY_SH_KERNELS(ATANH) ADD_UNARY_SH_KERNELS(ACOSH) ADD_UNARY_SH_KERNELS(INVERSE_SIGMOID) + ADD_UNARY_SH_KERNELS(TAN) TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I32, I32) TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I32, I32) @@ -166,6 +169,7 @@ static const struct { #undef ATANH_OPERATION #undef ACOSH_OPERATION #undef INVERSE_SIGMOID_OPERATION +#undef TAN_OPERATION /* * Kernel params */ @@ -452,16 +456,22 @@ OnError: REGISTER_BACKEND_CL( KERNEL_NAME, _##KERNEL_NAME##_setup ) +#if !(VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT) REGISTER_ELTWISE_UNARY_BACKEND_CL( sin, UNARY_SIN ) REGISTER_ELTWISE_UNARY_BACKEND_CL( cos, UNARY_COS ) +#endif +#if !(VX_ACTIVATION_EXP_VX_SUPPORT_EXT) REGISTER_ELTWISE_UNARY_BACKEND_CL( exp, UNARY_EXP ) +#endif REGISTER_ELTWISE_UNARY_BACKEND_CL( log, UNARY_LOG ) REGISTER_ELTWISE_UNARY_BACKEND_CL( neg, UNARY_NEG ) REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_sigmoid, UNARY_HSIGMOID ) REGISTER_ELTWISE_UNARY_BACKEND_CL( mish, UNARY_MISH ) REGISTER_ELTWISE_UNARY_BACKEND_CL( round, UNARY_ROUND ) +#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT) REGISTER_ELTWISE_UNARY_BACKEND_CL( gelu, UNARY_GELU ) REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_gelu, UNARY_HGELU ) +#endif REGISTER_ELTWISE_UNARY_BACKEND_CL( selu, UNARY_SELU ) REGISTER_ELTWISE_UNARY_BACKEND_CL( celu, UNARY_CELU ) REGISTER_ELTWISE_UNARY_BACKEND_CL( rcp, UNARY_RCP ) @@ -471,5 +481,6 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( atan, UNARY_ATAN ) REGISTER_ELTWISE_UNARY_BACKEND_CL( atanh, UNARY_ATANH ) REGISTER_ELTWISE_UNARY_BACKEND_CL( acosh, UNARY_ACOSH ) REGISTER_ELTWISE_UNARY_BACKEND_CL( inverse_sigmoid, UNARY_INVERSE_SIGMOID ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( tan, UNARY_TAN ) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c index e6a6743..6694331 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c @@ -22,7 +22,7 @@ * 
*****************************************************************************/ - +#if !(VX_TENSOR_GATHER_API_SUPPORT) #include #include #include @@ -420,3 +420,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( gather, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c index 193f388..4ec8672 100644 --- a/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c @@ -90,6 +90,8 @@ static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; #define _GRUCELL_ACTIVATION_Z_H_PARAM_NUM _cnt_of_array( _grucell_activation_z_h_kernel_param_def ) @@ -97,6 +99,8 @@ static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] = #define SCALAR_INPUT_TAIL (8) #define SCALAR_OUTPUT_SCALE (9) #define SCALAR_OUTPUT_ZP (10) +#define SCALAR_OUTPUT1_SCALE (11) +#define SCALAR_OUTPUT1_ZP (12) /* * Kernel initializer */ @@ -244,6 +248,8 @@ static vsi_nn_kernel_node_t _setup float input_tail = -(float)vsi_nn_get_tensor_zero_point(inputs[GRUCELL_ACT_Z_H_HSTATE]) * input_scale; float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]); float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]); + float output_scale1 = 1.0f / vsi_nn_get_tensor_scale(outputs[GRUCELL_ACT_Z_H_OUT_HSTATE]); + float output_zp1 = (float)vsi_nn_get_tensor_zero_point(outputs[GRUCELL_ACT_Z_H_OUT_HSTATE]); if( activation != VSI_NN_ACT_TANH ) { @@ -268,11 +274,17 @@ static vsi_nn_kernel_node_t _setup graph, F32, &output_scale ); node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + node_params[SCALAR_OUTPUT1_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &output_scale1 ); + node_params[SCALAR_OUTPUT1_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &output_zp1 ); status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT1_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT1_ZP] ); } } return node; diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c index a99f8b9..f88b6d9 100644 --- a/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c @@ -46,6 +46,7 @@ typedef enum _grucell_nn_activation_type_e { SIGMOID = VSI_NN_ACT_SIGMOID, HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, + RELU = VSI_NN_ACT_RELU, }grucell_nn_activation_type_e; #define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation" @@ -71,6 +72,9 @@ static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] = PACK_KERNEL_MAP( U8, F32, U8, SIGMOID ), 
PACK_KERNEL_MAP( I32, F32, I32, SIGMOID ), PACK_KERNEL_MAP( F32, F32, F32, SIGMOID ), + PACK_KERNEL_MAP( U8, F32, U8, RELU ), + PACK_KERNEL_MAP( I32, F32, I32, RELU ), + PACK_KERNEL_MAP( F32, F32, F32, RELU ), }; diff --git a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c index a13ec2e..cecb25a 100644 --- a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c @@ -22,7 +22,7 @@ * *****************************************************************************/ - +#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) #include #include #include @@ -360,3 +360,4 @@ final: __END_DECLS REGISTER_BACKEND_CL( layer_norm, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c index f7089bf..a7bcaae 100644 --- a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_LOGSOFTMAX_VX_SUPPORT) #include #include #include @@ -34,6 +35,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" __BEGIN_DECLS @@ -41,27 +43,30 @@ __BEGIN_DECLS /* * Define kernel meta. */ -#define HASH_LOG_SOFTMAX_KEY(_axis, _input_type, _output_type, _image_2d) \ - ((_axis << 20) | (_input_type << 12) | (_output_type << 4) | (_image_2d)) +#define HASH_LOG_SOFTMAX_KEY(_axis, _input_type, _output_type, _image_2d, exceed_limit) \ + ((_axis << 24) | (_input_type << 16) | (_output_type << 8) | (_image_2d << 4) | exceed_limit) #define VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_axis) \ "log_softmax_axis"#_axis + #define VSI_NN_GEN_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(_axis) \ + "log_softmax_exceed_axis"#_axis + #define HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("cl.log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE) #define TENSOR_LOG_SOFTMAX_KERNELS(AXIS, SRC0_TYPE, OUT_TYPE) \ - { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 0), \ HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, #define TENSOR_LOG_SOFTMAX_FLOAT(AXIS, SRC0_TYPE, OUT_TYPE) \ - { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 0), \ HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, F32, F32), \ VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, #define TENSOR_LOG_SOFTMAX_BFLOAT(AXIS, SRC0_TYPE, OUT_TYPE) \ - { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 0), \ HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, @@ -69,20 +74,28 @@ __BEGIN_DECLS CVIVANTE_NAMESPACE("cl.log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE"_2D") #define TENSOR_LOG_SOFTMAX_KERNELS_2D(AXIS, SRC0_TYPE, OUT_TYPE) \ - { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1, 0), \ HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, #define TENSOR_LOG_SOFTMAX_FLOAT_2D(AXIS, SRC0_TYPE, OUT_TYPE) \ - { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \ + { HASH_LOG_SOFTMAX_KEY(AXIS, 
SRC0_TYPE, OUT_TYPE, 1, 0), \ HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, F32, F32), \ VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, #define TENSOR_LOG_SOFTMAX_BFLOAT_2D(AXIS, SRC0_TYPE, OUT_TYPE) \ - { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1, 0), \ HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, +#define HASH_LOG_SOFTMAX_EXCEED_SH_KERNEL_NAME(AXIS, SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.log_softmax_exceed_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE) + +#define TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(AXIS, SRC0_TYPE, OUT_TYPE) \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 1), \ + HASH_LOG_SOFTMAX_EXCEED_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ + VSI_NN_GEN_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(AXIS) }, + static const struct { uint32_t key; char* function_name; @@ -92,31 +105,31 @@ static const struct { TENSOR_LOG_SOFTMAX_FLOAT(0, F32, F32) TENSOR_LOG_SOFTMAX_FLOAT(1, F32, F32) TENSOR_LOG_SOFTMAX_FLOAT(2, F32, F32) - TENSOR_LOG_SOFTMAX_FLOAT(0, F16, F16) - TENSOR_LOG_SOFTMAX_FLOAT(1, F16, F16) - TENSOR_LOG_SOFTMAX_FLOAT(2, F16, F16) TENSOR_LOG_SOFTMAX_BFLOAT(0, BF16, BF16) TENSOR_LOG_SOFTMAX_BFLOAT(1, BF16, BF16) TENSOR_LOG_SOFTMAX_BFLOAT(2, BF16, BF16) TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F32, F32) TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F32, F32) - TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F16, F16) - TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F16, F16) TENSOR_LOG_SOFTMAX_BFLOAT_2D(0, BF16, BF16) TENSOR_LOG_SOFTMAX_BFLOAT_2D(1, BF16, BF16) - TENSOR_LOG_SOFTMAX_KERNELS(0, U8, U8) - TENSOR_LOG_SOFTMAX_KERNELS(1, U8, U8) - TENSOR_LOG_SOFTMAX_KERNELS(2, U8, U8) TENSOR_LOG_SOFTMAX_KERNELS(0, U8, U8) TENSOR_LOG_SOFTMAX_KERNELS(1, U8, U8) TENSOR_LOG_SOFTMAX_KERNELS(2, U8, U8) TENSOR_LOG_SOFTMAX_KERNELS_2D(0, U8, U8) TENSOR_LOG_SOFTMAX_KERNELS_2D(1, U8, U8) - TENSOR_LOG_SOFTMAX_KERNELS_2D(0, U8, U8) - TENSOR_LOG_SOFTMAX_KERNELS_2D(1, U8, U8) + + TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(0, U8, U8) + TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(1, U8, U8) + + TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(0, F32, F32) + TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(1, F32, F32) + + TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, BF16) + TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, BF16) + }; /* @@ -198,12 +211,89 @@ final: return status; } /* _log_softmax_initializer() */ +DEF_KERNEL_INITIALIZER(_log_softmax_exceed_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_size_array_t * out_shape = NULL; + int32_t axis = 0; + int32_t width = 0; + int32_t height = 0; + int32_t depth = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + 
+ out_shape = attr[1]->shape; + + width = (int32_t)(out_shape->data[0]); + height = (int32_t)(out_shape->data[1]); + depth = attr[1]->shape->size > 2 ? (int32_t)(out_shape->data[2]) : 1; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + if (axis == 0) + { + gpu_param.global_size[0] = 1; + gpu_param.global_size[1] = depth; + } + else + { + gpu_param.global_size[0] = width; + gpu_param.global_size[1] = 1; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + if (axis == 0) + { + status |= vsi_nn_kernel_gpu_add_param( node, "width", &width ); + } + else + { + status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth ); + } + status |= vsi_nn_kernel_gpu_add_param( node, "height", &height ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + } + + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + } + + return status; +} + static vsi_status _query_kernel ( vsi_nn_tensor_t* const* const inputs, vsi_nn_tensor_t* const* const outputs, int32_t axis, vsi_bool image_2d, + vsi_bool exceed_limit, vsi_nn_kernel_t* kernel ) { @@ -215,7 +305,17 @@ static vsi_status _query_kernel input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_LOG_SOFTMAX_KEY( axis, input_dtype, output_dtype, image_2d ); + + if (input_dtype == F16) + { + input_dtype = F32; + } + if (output_dtype == F16) + { + output_dtype = F32; + } + if (exceed_limit) image_2d = vx_false_e; + key = HASH_LOG_SOFTMAX_KEY( axis, input_dtype, output_dtype, image_2d, exceed_limit ); for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) { @@ -229,7 +329,14 @@ static vsi_status _query_kernel snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); kernel->info.parameters = kernel_param_def; kernel->info.numParams = _cnt_of_array( kernel_param_def ); - kernel->info.initialize = _log_softmax_initializer; + if (exceed_limit) + { + kernel->info.initialize = _log_softmax_exceed_initializer; + } + else + { + kernel->info.initialize = _log_softmax_initializer; + } vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, kernel_map[i].source_name ); vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, @@ -254,7 +361,14 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank_in = 0; int32_t axis = 0; + int32_t new_axis = 0; + vsi_bool ret = vx_false_e; + vsi_bool exceed_limit = vx_false_e; + uint32_t i = 0; float beta = 0; float inputScale = vsi_nn_get_tensor_scale(inputs[0]); float outputScale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); @@ -270,16 +384,37 @@ static vsi_nn_kernel_node_t _setup scaleValue = scaleValue * beta * inputScale; beta = beta * inputScale; - if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, - inputs[0]->attr.dim_num ) - || axis > 2) + if (inputs[0]->attr.size[axis] >= GPU_TENSOR_MAX_WIDTH) + { + exceed_limit = vx_true_e; + } + + ret = vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rank_in, &new_axis); + + if (ret) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[0], rank_in ); + } + else { return NULL; } 
- image_2d = ((inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1) - && axis != 2); - status = _query_kernel( inputs, outputs, axis, image_2d, kernel ); + if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size, + reshape_tensors[0]->attr.dim_num ) + || new_axis > 2 || (new_axis == 2 && exceed_limit)) + { + return NULL; + } + + image_2d = ((reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1) + && new_axis != 2); + status = _query_kernel( inputs, outputs, new_axis, image_2d, exceed_limit, kernel ); if( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -287,10 +422,10 @@ static vsi_nn_kernel_node_t _setup if( node ) { vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, - inputs, 1, outputs, 1 ); + reshape_tensors, 1, &reshape_tensors[1], 1 ); node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( - graph, I32, &axis ); + graph, I32, &new_axis ); node_params[SCALAR_INPUT_BETA] = vsi_nn_kernel_scalar_create( graph, F32, &beta ); node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( @@ -311,9 +446,16 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); } } + + for (i = 0; i < 2; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + return node; } /* _setup() */ __END_DECLS REGISTER_BACKEND_CL( log_softmax, _setup ) +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c index ac342d3..f139ccb 100644 --- a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c @@ -75,6 +75,9 @@ __BEGIN_DECLS #define HASH_MATRIXMUL_4X_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ CVIVANTE_NAMESPACE("cl.gemm_4x_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) +#define HASH_MATRIXMUL_4X_TRANSA_LOCAL_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ + CVIVANTE_NAMESPACE("cl.gemm_4x_transa_local_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) + #define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0, 0), \ HASH_MATRIXMUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ @@ -90,6 +93,11 @@ __BEGIN_DECLS HASH_MATRIXMUL_4X_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ SOURCE }, +#define TENSOR_MATRIXMUL_4X_TRANSA_LOCAL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ + {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2, 1, 0), \ + HASH_MATRIXMUL_4X_TRANSA_LOCAL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ + SOURCE }, + #define TENSOR_MATRIXMUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 1, 0), \ HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ @@ -142,6 +150,7 @@ static const struct { TENSOR_MATRIXMUL_MERGE_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_3) TENSOR_MATRIXMUL_4X_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4) TENSOR_MATRIXMUL_4X_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4) + TENSOR_MATRIXMUL_4X_TRANSA_LOCAL_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4) }; /* @@ -313,6 +322,49 @@ final: return status; } /* _matrixmul_4x_initializer() */ +DEF_KERNEL_INITIALIZER(_matrixmul_4x_local_initializer) +(vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t* param, + size_t param_size) { + vsi_status status = 
VSI_FAILURE; + gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}; + + vsi_nn_kernel_tensor_attr_t* attr = NULL; + vsi_size_t width = 0; + + + VSI_UNREFERENCED(param_size); + + attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]); + CHECK_PTR_FAIL_GOTO(attr, "Create tensor attr buffer fail.", final); + + width = attr->shape->data[0]; + + gpu_param.dim = 2; + gpu_param.local_size[0] = 1; + gpu_param.local_size[1] = 64; + gpu_param.local_size[2] = 1; + + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = + (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0]; + gpu_param.global_size[1] = 64; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config(node, &gpu_param); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr) { + vsi_nn_kernel_tensor_attr_release(&attr); + attr = NULL; + } + return status; +} /* _matrixmul_4x_local_initializer() */ + static vsi_status _query_kernel ( vsi_nn_kernel_t * kernel, @@ -403,7 +455,10 @@ static vsi_status _query_kernel kernel->info.numParams = _cnt_of_array( _matrixmul_merge_kernel_param_def ); } - if (flag_4x) { + if ((flag_4x == 2) && (transa == 1)) { + kernel->info.initialize = _matrixmul_4x_local_initializer; + } + else if (flag_4x == 1) { kernel->info.initialize = _matrixmul_4x_initializer; } else { kernel->info.initialize = _matrixmul_initializer; @@ -471,6 +526,7 @@ static vsi_nn_kernel_node_t _setup uint32_t stride_axis_in_out[9] = {0}; vsi_nn_tensor_t* tmp_inputs[2] = {NULL}; vsi_nn_tensor_t* tmp_outputs[1] = {NULL}; + vsi_bool shader_cnt_support = FALSE; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); @@ -585,7 +641,20 @@ static vsi_nn_kernel_node_t _setup rs_out_tensors = vsi_nn_reshape_tensor(graph, tmp_outputs[0], final_shape, final_rank); final_out_tensors[0] = rs_out_tensors; - flag_4x = 1; + +#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT + shader_cnt_support = + (graph->ctx->config.subGroupSize >= 64 && graph->ctx->config.use_40bits_va) ? 
TRUE : FALSE; +#endif + if ((in1_h % 64 == 0) && (transFlg == 1) && (out_h % 8 == 0) && shader_cnt_support) + { + flag_4x = 2; + } + else + { + flag_4x = 1; + } + } } diff --git a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c index 3446fef..33bacb0 100644 --- a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c @@ -246,28 +246,49 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - - float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); - float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale; - float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); - float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; - float outputScale = vsi_nn_get_tensor_scale(outputs[0]); - float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float input0_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0_scale; + float input1_scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1_tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1_scale; + float output_scale = vsi_nn_get_tensor_scale(outputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t new_rank = 0; + vsi_bool ret = TRUE; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); VSI_UNREFERENCED(params); - outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; + output_scale = vsi_abs(output_scale) < 1e-5 ? 
0.0f : 1.0f / output_scale; - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + ret = vsi_nn_kernel_optimize_eltwise_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if (ret == FALSE) { - return NULL; + goto final; } - image_2d = (outputs[0]->attr.dim_num == 2); - status = _query_kernel( inputs, outputs, image_2d, kernel ); + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], new_rank ); + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size, + reshape_tensors[2]->attr.dim_num ) ) + { + goto final; + } + + image_2d = (reshape_tensors[2]->attr.dim_num == 2); + status = _query_kernel( reshape_tensors, &reshape_tensors[2], image_2d, kernel ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -275,19 +296,19 @@ static vsi_nn_kernel_node_t _setup if ( node ) { vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, - inputs, 2, outputs, 1 ); + reshape_tensors, 2, &reshape_tensors[2], 1 ); node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( - graph, F32, &input0Scale ); + graph, F32, &input0_scale ); node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create( - graph, F32, &input0Tail ); + graph, F32, &input0_tail ); node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( - graph, F32, &input1Scale ); + graph, F32, &input1_scale ); node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create( - graph, F32, &input1Tail ); + graph, F32, &input1_tail ); node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( - graph, F32, &outputScale ); + graph, F32, &output_scale ); node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( - graph, F32, &outputZP ); + graph, F32, &output_zp ); /* Pass parameters to node. 
*/ status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); @@ -300,6 +321,12 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); } } + +final: + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + vsi_safe_release_tensor(reshape_tensors[2]); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c index 5d85656..4d607b6 100644 --- a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c @@ -246,29 +246,49 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - - float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); - float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale; - float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); - float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; - float outputScale = vsi_nn_get_tensor_scale(outputs[0]); - float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float input0_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0_scale; + float input1_scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1_tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1_scale; + float output_scale = vsi_nn_get_tensor_scale(outputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t new_rank = 0; + vsi_bool ret = TRUE; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); VSI_UNREFERENCED(params); + output_scale = vsi_abs(output_scale) < 1e-5 ? 0.0f : 1.0f / output_scale; - outputScale = vsi_abs(outputScale) < 1e-5 ? 
0.0f : 1.0f / outputScale; + ret = vsi_nn_kernel_optimize_eltwise_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + if (ret == FALSE) { - return NULL; + goto final; } - image_2d = (outputs[0]->attr.dim_num == 2); - status = _query_kernel( inputs, outputs, image_2d, kernel ); + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], new_rank ); + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size, + reshape_tensors[2]->attr.dim_num ) ) + { + goto final; + } + + image_2d = (reshape_tensors[2]->attr.dim_num == 2); + status = _query_kernel( reshape_tensors, &reshape_tensors[2], image_2d, kernel ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -276,19 +296,19 @@ static vsi_nn_kernel_node_t _setup if ( node ) { vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, - inputs, 2, outputs, 1 ); + reshape_tensors, 2, &reshape_tensors[2], 1 ); node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( - graph, F32, &input0Scale ); + graph, F32, &input0_scale ); node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create( - graph, F32, &input0Tail ); + graph, F32, &input0_tail ); node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( - graph, F32, &input1Scale ); + graph, F32, &input1_scale ); node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create( - graph, F32, &input1Tail ); + graph, F32, &input1_tail ); node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( - graph, F32, &outputScale ); + graph, F32, &output_scale ); node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( - graph, F32, &outputZP ); + graph, F32, &output_zp ); /* Pass parameters to node. 
*/ status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); @@ -301,6 +321,12 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); } } + +final: + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + vsi_safe_release_tensor(reshape_tensors[2]); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/pow_cl.c b/src/tim/vx/internal/src/kernel/cl/pow_cl.c index 6a38b4e..06e3652 100644 --- a/src/tim/vx/internal/src/kernel/cl/pow_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/pow_cl.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_TENSOR_POW_API_SUPPORT) #include #include #include @@ -294,4 +295,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( pow, _setup ) - +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/resize_cubic_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_cubic_cl.c new file mode 100644 index 0000000..46d2977 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/resize_cubic_cl.c @@ -0,0 +1,320 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +#define _RESIZE_CUBIC_KERNEL_SOURCE() "resize_cubic" + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) ) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.resize_cubic_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _RESIZE_CUBIC_KERNEL_SOURCE() } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_cubic_kernel_map[] = +{ + PACK_KERNEL_MAP( F32, F32), + PACK_KERNEL_MAP( U8, U8), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _resize_cubic_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define SCALAR_SCALE_X (2) +#define SCALAR_SCALE_Y (3) +#define SCALAR_HALF_PIXEL (4) +#define SCALAR_INPUT_SCALE (5) +#define SCALAR_INPUT_TAIL (6) +#define SCALAR_OUTPUT_SCALE (7) +#define SCALAR_OUTPUT_TAIL (8) + + +#define RESIZE_CUBIC_NUM 5 +#define RESIZE_CUBIC_QUANT_NUM _cnt_of_array( _resize_cubic_kernel_param_def ) + + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_cubic_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_size_array_t * out_shape = NULL; + + VSI_UNREFERENCED(param_size); + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _resize_cubic_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool *is_use_u8_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _resize_cubic_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_cubic_kernel_map ); + vx_param_description_t * param_def = _resize_cubic_kernel_param_def; + size_t param_def_size = _cnt_of_array( _resize_cubic_kernel_param_def ); + vx_kernel_initialize_f initializer = _resize_cubic_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + if (F16 == out_dtype) + { + out_dtype = F32; + } + + if ((U8 == in_dtype) || (U8 == out_dtype)) + { + param_def_size = RESIZE_CUBIC_QUANT_NUM; + *is_use_u8_kernel = TRUE; + } + else + { + param_def_size = RESIZE_CUBIC_NUM; + *is_use_u8_kernel = FALSE; + } + + key = RESIZE_CUBIC_HASH_KEY( in_dtype, out_dtype ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[RESIZE_CUBIC_QUANT_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + vsi_size_t in_width = inputs[0]->attr.size[0]; + vsi_size_t in_height = inputs[0]->attr.size[1]; + vsi_size_t out_width = outputs[0]->attr.size[0]; + vsi_size_t out_height = outputs[0]->attr.size[1]; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input_tail = -(input_zp * input_scale); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float half_pixel_value = 0.0f; + float scale_factor_x = 0.0f; + float scale_factor_y = 0.0f; + vsi_bool is_use_u8_kernel = FALSE; + + if (align_corners && out_width > 1) + { + scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1); + } + else 
+ { + scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width; + } + + if (align_corners && out_height > 1) + { + scale_factor_y = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1); + } + else + { + scale_factor_y = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height; + } + + if (half_pixel_centers) + { + half_pixel_value = 0.5f; + } + else + { + half_pixel_value = 0.0f; + } + + + status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel ); + if (VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + size_t node_params_num = RESIZE_CUBIC_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, RESIZE_CUBIC_QUANT_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x ); + node_params[SCALAR_SCALE_Y] = vsi_nn_kernel_scalar_create(graph, F32, &scale_factor_y ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value ); + if (is_use_u8_kernel) + { + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input_tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &output_zp ); + node_params_num = RESIZE_CUBIC_QUANT_NUM; + } + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_Y] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + if (is_use_u8_kernel) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] ); + } + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( resize_cubic, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_reduction_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_reduction_cl.c new file mode 100644 index 0000000..299a2f6 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_reduction_cl.c @@ -0,0 +1,727 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +typedef enum +{ + NONE = 0, + Add, + Mul, + Max, + Min +} vsi_scatter_nd_update_type_e; + +/* + * Define kernel meta. + */ +#define KERNEL_SOURCE_1 "scatter_nd_update_reduction" +#define KERNEL_SOURCE_2 "scatter_nd_update_reduction_conv" + +#define HASH_SCATTER_ND_UPDATE_KEY(_input0_type, _input2_type, _output_type, _stage, _op) \ + ((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | (_stage << 4) | (_op)) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("cl.scatter_nd_update_reduction_preprocess_"#SRC0_TYPE) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, SRC2_TYPE) \ + CVIVANTE_NAMESPACE("cl.scatter_nd_update_reduction_"#REDUCTION_TYPE"_"#SRC2_TYPE) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.scatter_nd_update_reduction_conv_"#DST_TYPE) + +#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(IN0_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, 0, 0, 0, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(IN0_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(REDUCTION_TYPE, IN2_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, 0, 1, REDUCTION_TYPE), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, IN2_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(0, 0, OUT_TYPE, 2, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(OUT_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type scatter_nd_update_reduction_preprocess_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(F32, KERNEL_SOURCE_1) +}; + +static const _kernel_map_type scatter_nd_update_reduction_process_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I8, 
KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, F32, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, F32, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, F32, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, F32, KERNEL_SOURCE_1) +}; + +static const _kernel_map_type scatter_nd_update_reduction_conv_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(U8, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I8, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I16, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(F16, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(F32, KERNEL_SOURCE_2) +}; + +/* + * Kernel params + */ +static vx_param_description_t _scatter_nd_update_preprocess_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static vx_param_description_t _scatter_nd_update_process_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static vx_param_description_t _scatter_nd_update_conv_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +#define 
_SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_preprocess_kernel_param_def) +#define _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_process_kernel_param_def) +#define _SCATTER_ND_UPDATE_CONV_PARAM_NUM _cnt_of_array(_scatter_nd_update_conv_kernel_param_def) + +static vsi_status cal_scatter_nd_update_tensor_reshape_size + ( + vsi_nn_tensor_t ** inputs, + vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], + uint32_t block_size, + uint32_t coordDim, + vsi_size_t strides[VSI_NN_MAX_DIM_NUM], + int32_t* newDim + ) +{ + vsi_status status = VSI_SUCCESS; + uint32_t dims_num = inputs[0]->attr.dim_num; + vsi_size_t *input_size = inputs[0]->attr.size; + uint32_t i = 0; + vsi_size_t elementCnt = 1; + +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH + + newDim[0] = 0; + for (i = 0; i < dims_num; ++i) + { + elementCnt *= input_size[i]; + } + + for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + { + sizes[i] = 1; + } + + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + newDim[0] = 2; + + if (coordDim == 1 && strides) // index shape + { + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + strides[i] = 0; + } + } + else if (coordDim >= 2 && coordDim <= VSI_NN_MAX_DIM_NUM && strides) + { + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + strides[i] = 0; + } + + strides[0] = input_size[dims_num - coordDim]; + for (i = 1; i < coordDim - 1; i++) + { + strides[i] = strides[i - 1] * input_size[dims_num - coordDim + i]; + } + } + +#undef VSI_NN_MAX_IMAGE_WIDTH + + return status; +} /* cal_scatter_nd_update_tensor_reshape_size */ + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_scatter_nd_update_reduction_preprocess_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 1, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t width = 0; + int32_t element_size = 1; + int32_t i = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + for (i = 0; i < (int32_t)attr[0]->shape->size; i++) + { + element_size *= (int32_t)attr[0]->shape->data[i]; + } + width = element_size / 8; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + if (element_size < 8) + { + gpu_param.global_size[0] = element_size; + } + else + { + gpu_param.global_size[0] = width; + } + gpu_param.global_size[1] = 1; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _scatter_nd_update_reduction_preprocess_initializer() */ + +DEF_KERNEL_INITIALIZER(_scatter_nd_update_process_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + int32_t block_size = 1; + int32_t index_num = 1; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = 
vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + block_size = (int32_t)(attr[1]->shape->data[0]); + index_num = (int32_t)(attr[0]->shape->data[1]); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = block_size; + gpu_param.global_size[1] = index_num; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + return status; +} /* _scatter_nd_update_process_initializer() */ + +DEF_KERNEL_INITIALIZER(_scatter_nd_update_conv_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 1, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t width = 0; + int32_t element_size = 1; + int32_t i = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + for (i = 0; i < (int32_t)attr[0]->shape->size; i++) + { + element_size *= (int32_t)attr[0]->shape->data[i]; + } + width = element_size / 8; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + if (element_size < 8) + { + gpu_param.global_size[0] = element_size; + } + else + { + gpu_param.global_size[0] = width; + } + gpu_param.global_size[1] = 1; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _scatter_nd_update_conv_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel_preprocess, + vsi_nn_kernel_t* kernel_process, + vsi_nn_kernel_t* kernel_conv, + int32_t reduction_flg + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e input2_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + size_t i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, 0, 0, 0, 0 ); + + for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map); i ++ ) + { + if ( scatter_nd_update_reduction_preprocess_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map) ) + { + snprintf( kernel_preprocess->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_reduction_preprocess_map[i].function_name ); + kernel_preprocess->info.parameters = _scatter_nd_update_preprocess_kernel_param_def; + kernel_preprocess->info.numParams = _cnt_of_array( _scatter_nd_update_preprocess_kernel_param_def ); + kernel_preprocess->info.initialize = 
_scatter_nd_update_reduction_preprocess_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + scatter_nd_update_reduction_preprocess_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_reduction_preprocess_map[i].source_name ); + status = VSI_SUCCESS; + } + + key = HASH_SCATTER_ND_UPDATE_KEY( 0, input2_dtype, 0, 1, reduction_flg ); + + for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_process_map); i ++ ) + { + if ( scatter_nd_update_reduction_process_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(scatter_nd_update_reduction_process_map) ) + { + snprintf( kernel_process->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_reduction_process_map[i].function_name ); + kernel_process->info.parameters = _scatter_nd_update_process_kernel_param_def; + kernel_process->info.numParams = _cnt_of_array( _scatter_nd_update_process_kernel_param_def ); + kernel_process->info.initialize = _scatter_nd_update_process_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + scatter_nd_update_reduction_process_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_reduction_process_map[i].source_name ); + status = VSI_SUCCESS; + } + + key = HASH_SCATTER_ND_UPDATE_KEY( 0, 0, output_dtype, 2, 0 ); + + for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_conv_map); i ++ ) + { + if ( scatter_nd_update_reduction_conv_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(scatter_nd_update_reduction_conv_map) ) + { + snprintf( kernel_conv->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_reduction_conv_map[i].function_name ); + kernel_conv->info.parameters = _scatter_nd_update_conv_kernel_param_def; + kernel_conv->info.numParams = _cnt_of_array( _scatter_nd_update_conv_kernel_param_def ); + kernel_conv->info.initialize = _scatter_nd_update_conv_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + scatter_nd_update_reduction_conv_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_reduction_conv_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t strides[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t coord_strides[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); + int32_t reduction = vsi_nn_kernel_param_get_int32( params, "reduction" ); + int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input_zp_scale = 0 - input_zp * input_scale; + float update_zp 
= (float)vsi_nn_get_tensor_zero_point(inputs[2]); + float update_scale = vsi_nn_get_tensor_scale(inputs[2]); + float update_zp_scale = 0 - update_zp * update_scale; + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + vsi_nn_tensor_t * tensors[2] = { NULL }; + vsi_nn_kernel_t * ikernels[2] = { NULL }; + int32_t i = 0; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = cal_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], coord_dim, 0, + NULL, &rs_idx_dim); + status |= cal_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1], block_size, 0, + NULL, &rs_in_dim); + status |= cal_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim, + strides, &rs_out_dim); + CHECK_STATUS_FAIL_GOTO( status, final ); + + coord_strides[coord_dim - 1] = 1; + for (i = 0; i < coord_dim - 1; i++) + { + coord_strides[i] = (int32_t)strides[coord_dim - 2 - i]; + } + + { + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_node_t preprocess_node = NULL; + vsi_nn_kernel_node_t process_node = NULL; + vsi_nn_kernel_node_param_t preprocess_params[_SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t process_params[_SCATTER_ND_UPDATE_PROCESS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t conv_params[_SCATTER_ND_UPDATE_CONV_PARAM_NUM] = { NULL }; + int32_t width = 1; + int32_t res = 0; + int32_t update_width = (int32_t)shapes[1][0]; + int32_t output_width = (int32_t)shapes[2][0]; + + ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL ); + ikernels[0]->unique_id = kernel->unique_id; + ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL ); + ikernels[1]->unique_id = kernel->unique_id; + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype = outputs[0]->attr.dtype; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.is_const = FALSE; + attr.vtl = TRUE; + + for (i = 0; i < rs_out_dim; i++) + { + attr.size[i] = shapes[2][i]; + width *= (int32_t)shapes[2][i]; + } + attr.dim_num = rs_out_dim; + + res = width % 8; + width = (width >> 3) << 3; + + tensors[0] = vsi_nn_CreateTensor( graph, &attr ); // ref' + attr.size[0] = 1; + attr.size[1] = 1; + attr.dim_num = rs_out_dim; + tensors[1] = vsi_nn_CreateTensor( graph, &attr ); // link_buffer0 + + status = _query_kernel( inputs, outputs, ikernels[0], ikernels[1], kernel, reduction); + if ( VSI_SUCCESS == status) + { + // convert ref to float + preprocess_node = vsi_nn_kernel_create_node( graph, ikernels[0] ); + if (preprocess_node) + { + uint32_t index = 0; + /* Pass parameters to node. 
*/ + preprocess_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim ); + preprocess_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res ); + preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp_scale ); + status = vsi_nn_kernel_node_pass_param( preprocess_node, preprocess_params, + _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &preprocess_params[0] ); + vsi_nn_kernel_scalar_release( &preprocess_params[2] ); + vsi_nn_kernel_scalar_release( &preprocess_params[3] ); + vsi_nn_kernel_scalar_release( &preprocess_params[4] ); + vsi_nn_kernel_scalar_release( &preprocess_params[5] ); + } + + // update + process_node = vsi_nn_kernel_create_node( graph, ikernels[1] ); + if (process_node) + { + uint32_t index = 0; + /* Pass parameters to node. */ + process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); + process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); + process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[0] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[1] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[2] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[3] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[4] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[5] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[6] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &update_width ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_width ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &update_scale ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &update_zp_scale ); + status = vsi_nn_kernel_node_pass_param( process_node, process_params, + _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &process_params[0] ); + vsi_nn_kernel_tensor_release( &process_params[1] ); + vsi_nn_kernel_scalar_release( &process_params[4] ); + vsi_nn_kernel_scalar_release( &process_params[5] ); + vsi_nn_kernel_scalar_release( &process_params[6] ); + vsi_nn_kernel_scalar_release( &process_params[7] ); + vsi_nn_kernel_scalar_release( &process_params[8] ); + vsi_nn_kernel_scalar_release( &process_params[9] ); + vsi_nn_kernel_scalar_release( &process_params[10] ); + vsi_nn_kernel_scalar_release( &process_params[11] ); + vsi_nn_kernel_scalar_release( &process_params[12] ); + vsi_nn_kernel_scalar_release( &process_params[13] ); + vsi_nn_kernel_scalar_release( &process_params[14] ); + vsi_nn_kernel_scalar_release( &process_params[15] ); + } + + // convert float to output + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 0; + /* Pass 
parameters to node. */ + conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + conv_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); + conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res ); + conv_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + conv_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + status = vsi_nn_kernel_node_pass_param( node, conv_params, _SCATTER_ND_UPDATE_CONV_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &conv_params[2] ); + vsi_nn_kernel_scalar_release( &conv_params[3] ); + vsi_nn_kernel_scalar_release( &conv_params[4] ); + vsi_nn_kernel_scalar_release( &conv_params[5] ); + vsi_nn_kernel_scalar_release( &conv_params[6] ); + } + } + + if (preprocess_node) {vsi_nn_kernel_node_release( &preprocess_node );} + if (process_node) {vsi_nn_kernel_node_release( &process_node );} + } + +final: + if (ikernels[0]) + { + vsi_nn_kernel_release(&ikernels[0]); + } + if (ikernels[1]) + { + vsi_nn_kernel_release(&ikernels[1]); + } + vsi_safe_release_tensor(tensors[0]); + vsi_safe_release_tensor(tensors[1]); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( scatter_nd_update_reduction, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/select_cl.c b/src/tim/vx/internal/src/kernel/cl/select_cl.c index ab44901..d2634c4 100644 --- a/src/tim/vx/internal/src/kernel/cl/select_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/select_cl.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_TENSOR_SELECT_VX_SUPPORT) #include #include #include @@ -359,3 +360,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( select, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/tile_cl.c b/src/tim/vx/internal/src/kernel/cl/tile_cl.c index 266b8ed..8227a36 100644 --- a/src/tim/vx/internal/src/kernel/cl/tile_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/tile_cl.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_TENSOR_TILE_API_SUPPORT) #include #include #include @@ -445,3 +446,4 @@ final: __END_DECLS REGISTER_BACKEND_CL( tile, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c index 3d68840..b8cdfd0 100644 --- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c @@ -438,7 +438,7 @@ static vsi_nn_kernel_node_t _setup vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; int32_t width = (int32_t)block_size; int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k"); - int32_t num_stages = (int32_t)ceil(log10(block_size / 2.0f) / log10(2.0f)); + int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0); vsi_bool is_odd_even_sort = FALSE; size_t param_num = _TOPK_PARAM_NUM; float inputScale = vsi_nn_get_tensor_scale(inputs[0]); diff --git a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c index e1861a2..1fcd398 100644 --- a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c @@ -106,14 +106,12 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) 
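+/* The initializer below (like the other EVIS initializers updated in this
+ * patch) reads quantization parameters from the unified scale/zero_point
+ * fields of vsi_nn_kernel_tensor_attr_t instead of branching on
+ * VSI_NN_KERNEL_QUANT_DFP / VSI_NN_KERNEL_QUANT_ASYMM. A minimal sketch of
+ * the mapping those unified fields are assumed to carry (the helper is
+ * illustrative only, not part of ovxlib):
+ *
+ *     static float dfp_to_scale(int32_t fl)
+ *     {
+ *         return (fl >= 0) ? 1.0f / (float)((int64_t)1 << fl)
+ *                          : (float)((int64_t)1 << -fl);
+ *     }
+ *
+ * i.e. real = (q - zero_point) * scale for both DFP (zero_point == 0) and
+ * affine-asymmetric tensors, so expressions such as
+ * input_tail = 0 - scale * zero_point and output_scale = 1.0f / attr->scale
+ * reproduce what the removed per-type branches computed case by case. */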
vsi_nn_kernel_dtype_e output_dtype = F16; vsi_nn_kernel_tensor_attr_t *input0_attr = NULL, *input1_attr = NULL, *output_attr = NULL; vsi_size_array_t *input_shape = NULL; - float scaleIn = 1.0f; - int32_t input_ZP = 0; - float scaleIn1 = 1.0f; - int32_t input_ZP1 = 0; - float scaleOut = 1.0f; - int32_t output_ZP = 0; - int32_t fixpoint = 0, fixpoint1 = 0, fixpoint_out = 0; - float inScale_dfp, inScale_dfp1; + float scaleIn = 1.0f; + int32_t input_ZP = 0; + float scaleIn1 = 1.0f; + int32_t input_ZP1 = 0; + float scaleOut = 1.0f; + int32_t output_ZP = 0; float eps = 0.0f; float rsEps = 0.0f; float dimRatio = 0.0f; @@ -135,80 +133,12 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) rsEps = (float)(1.0f / sqrtf(eps)); dimRatio = (float)(1.0 / (input_shape->data[0])); - - if ( VSI_NN_KERNEL_QUANT_DFP == input0_attr->quant ) - { - fixpoint = input0_attr->dfp.fl; - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == input0_attr->quant ) - { - input_ZP = input0_attr->asymm.zero_point; - scaleIn = input0_attr->asymm.scale; - } - else - { - input_ZP = 0; - scaleIn = 1.0f; - } - - //input1 - if ( VSI_NN_KERNEL_QUANT_DFP == input1_attr->quant ) - { - fixpoint1 = input1_attr->dfp.fl; - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == input1_attr->quant ) - { - input_ZP1 = input1_attr->asymm.zero_point; - scaleIn1 = input1_attr->asymm.scale; - } - else - { - input_ZP1 = 0; - scaleIn1 = 1.0f; - } - - //output - if ( VSI_NN_KERNEL_QUANT_DFP == output_attr->quant ) - { - fixpoint_out = output_attr->dfp.fl; - if (fixpoint_out >= 0) - { - scaleOut = 1.0f / (vx_float32) ((int64_t)1 << fixpoint_out); - } - else - { - scaleOut = (vx_float32) ((int64_t)1 << -fixpoint_out); - } - output_ZP = 0; - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) - { - output_ZP = output_attr->asymm.zero_point; - scaleOut = output_attr->asymm.scale; - } - else - { - output_ZP = 0; - scaleOut = 1.0f; - } - - if (fixpoint >= 0) - { - inScale_dfp = 1.0f / (vx_float32) ((int64_t)1 << fixpoint); - } - else - { - inScale_dfp = (vx_float32) ((int64_t)1 << -fixpoint); - } - - if (fixpoint1 >= 0) - { - inScale_dfp1 = 1.0f / (vx_float32) ((int64_t)1 << fixpoint1); - } - else - { - inScale_dfp1 = (vx_float32) ((int64_t)1 << -fixpoint1); - } + scaleIn = input0_attr->scale; + input_ZP = input0_attr->zero_point; + scaleIn1 = input1_attr->scale; + input_ZP1 = input1_attr->zero_point; + scaleOut = output_attr->scale; + output_ZP = output_attr->zero_point; gpu_param.global_offset[0] = 0; gpu_param.global_offset[1] = 0; @@ -349,8 +279,8 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) &uniConvertInt16ScaleToFp32Fst_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16ScaleToFp32Sec_4x4", &uniConvertInt16ScaleToFp32Sec_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "inScale_i16", &inScale_dfp); - status |= vsi_nn_kernel_gpu_add_param(node, "inScale1_i16", &inScale_dfp1); + status |= vsi_nn_kernel_gpu_add_param(node, "inScale_i16", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "inScale1_i16", &scaleIn1); CHECK_STATUS_FAIL_GOTO(status, final ); } width = (int32_t)input_shape->data[0]; diff --git a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c index 80a1b21..e189ce3 100644 --- a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c @@ -215,41 +215,11 @@ DEF_KERNEL_INITIALIZER(_batch_norm_initializer) output_attr = vsi_nn_kernel_tensor_attr_create( 
(vsi_nn_kernel_tensor_t)output); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); - if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = input_attr->dfp.fl; - if (fl > 0) - { - input_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input_scale = (float)((int64_t)1 << -fl); - } - } - else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input_scale = input_attr->asymm.scale; - input_tail = 0 - input_scale * (float)input_attr->asymm.zero_point; - } + input_scale = input_attr->scale; + input_tail = 0 - input_scale * (float)input_attr->zero_point; - if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = output_attr->dfp.fl; - if (fl > 0) - { - output_scale = (float) ((int64_t)1 << fl); - } - else - { - output_scale = 1.0f / (float)((int64_t)1 << -fl); - } - } - else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - output_scale = 1.0f / output_attr->asymm.scale; - output_zp = (float)output_attr->asymm.zero_point; - } + output_scale = 1.0f / output_attr->scale; + output_zp = (float)output_attr->zero_point; pack_key = _PACK_BATCH_NORM_KEY( input_attr->dtype, output_attr->dtype ); diff --git a/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c b/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c index 553f8b7..a94c93c 100644 --- a/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c @@ -121,23 +121,20 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) vsi_nn_kernel_dtype_e output_dtype = F16; uint32_t depth = 0; - float half_input0_wh[2]; - float add_float_value[2]; - uint32_t in0_width; - uint32_t in0_height; - uint32_t out_width; - uint32_t out_height; - int32_t align_corners; + float half_input0_wh[2] = {0}; + float add_float_value[2] = {0}; + uint32_t in0_width = 0; + uint32_t in0_height = 0; + uint32_t out_width = 0; + uint32_t out_height = 0; + int32_t align_corners = 0; - int32_t src0FixPointPos = 0; - int32_t src1FixPointPos = 0; - int32_t dstFixPointPos = 0; - float input0_scale = 1.0; - int32_t input0ZP = 0; - float input1_scale = 1.0; - int32_t input1ZP = 0; - float output_scale = 1.0; - int32_t outputZP = 0; + float input0_scale = 1.0; + int32_t input0ZP = 0; + float input1_scale = 1.0; + int32_t input1ZP = 0; + float output_scale = 1.0; + int32_t outputZP = 0; VSI_UNREFERENCED(param_size); @@ -165,54 +162,14 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) input1_dtype = input_attr[1]->dtype; output_dtype = output_attr->dtype; - if (U8 == input0_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant) { - input0_scale = input_attr[0]->asymm.scale; - input0ZP = input_attr[0]->asymm.zero_point; - } else if (VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant) { - src0FixPointPos = input_attr[0]->dfp.fl; - if (src0FixPointPos >= 0) { - input0_scale = 1.0f / (float)((int64_t)1 << src0FixPointPos); - } else if (src0FixPointPos < 0) { - input0_scale = (float)((int64_t)1 << -src0FixPointPos); - } - input0ZP = 0; - } else { - input0_scale = 1.0f; - input0ZP = 0; - } + input0_scale = input_attr[0]->scale; + input0ZP = input_attr[0]->zero_point; - if (U8 == input1_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr[1]->quant) { - input1_scale = input_attr[1]->asymm.scale; - input1ZP = input_attr[1]->asymm.zero_point; - } else if (VSI_NN_KERNEL_QUANT_DFP == input_attr[1]->quant) { - src1FixPointPos = input_attr[1]->dfp.fl; - if (src1FixPointPos >= 0) { - input1_scale = 1.0f 
/ (float)((int64_t)1 << src1FixPointPos); - } else if (src1FixPointPos < 0) { - input1_scale = (float)((int64_t)1 << -src1FixPointPos); - } - input1ZP = 0; - } else { - input1_scale = 1.0f; - input1ZP = 0; - } - - if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) { - output_scale = output_attr->asymm.scale; - outputZP = output_attr->asymm.zero_point; - } else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) { - dstFixPointPos = output_attr->dfp.fl; - if (dstFixPointPos >= 0) { - output_scale = (float)((int64_t)1 << dstFixPointPos); - } else if (dstFixPointPos < 0) { - output_scale = 1.0f / (float)((int64_t)1 << -dstFixPointPos); - } - outputZP = 0; - } else { - output_scale = 1.0; - outputZP = 0; - } + input1_scale = input_attr[1]->scale; + input1ZP = input_attr[1]->zero_point; + output_scale = output_attr->scale; + outputZP = output_attr->zero_point; in0_width = (uint32_t)(in0_shape->data[0]); in0_height = (uint32_t)(in0_shape->data[1]); @@ -496,7 +453,7 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) I16 == output_dtype)) || ((I8 == input0_dtype && I8 == input1_dtype && I8 == output_dtype))) { - float dfpScale = input0_scale * output_scale; + float dfpScale = input0_scale / output_scale; gpu_dp_inst_t uniDFPtoFp32_part0_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt diff --git a/src/tim/vx/internal/src/kernel/evis/clip_evis.c b/src/tim/vx/internal/src/kernel/evis/clip_evis.c index 1218322..8aef159 100644 --- a/src/tim/vx/internal/src/kernel/evis/clip_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/clip_evis.c @@ -179,7 +179,6 @@ DEF_KERNEL_INITIALIZER(_clip_initializer) / gpu_param.global_scale[1]); gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; - if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) { srcFixPointPos = input_attr->dfp.fl; diff --git a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c index bc5e267..52d610c 100644 --- a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) #include #include #include @@ -319,41 +320,10 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer) out_shape = attr[2]->shape; - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[0]->dfp.fl; - if (fl > 0) - { - input0Scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input0Scale = (float)((int64_t)1 << -fl); - } - } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input0Scale = attr[0]->asymm.scale; - input0Tail = 0 - attr[0]->asymm.zero_point * input0Scale; - } - - if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[1]->dfp.fl; - if (fl > 0) - { - input1Scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input1Scale = (float)((int64_t)1 << -fl); - } - } - else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input1Scale = attr[1]->asymm.scale; - input1Tail = 0 - attr[1]->asymm.zero_point * input1Scale; - } + input0Scale = attr[0]->scale; + input0Tail = 0 - attr[0]->zero_point * input0Scale; + input1Scale = attr[1]->scale; + input1Tail = 0 - attr[1]->zero_point * input1Scale; gpu_param.global_scale[0] = 8; gpu_param.global_scale[1] = 1; @@ -616,3 +586,4 @@ final: REGISTER_BACKEND_EVIS( relational_ops, _setup ) __END_DECLS +#endif diff --git a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c 
b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c index e5669b0..f4fec0b 100644 --- a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c @@ -152,23 +152,12 @@ DEF_KERNEL_INITIALIZER(_conv1d_ovxlib_initializer) out_shape = output_attr->shape; weight_shape = weights_attr->shape; - if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) - { - input_ZP = input_attr->asymm.zero_point; - scaleIn = input_attr->asymm.scale; - } - - if ( VSI_NN_KERNEL_QUANT_ASYMM == weights_attr->quant ) - { - weight_ZP = weights_attr->asymm.zero_point; - scaleWights = weights_attr->asymm.scale; - } - - if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) - { - output_ZP = (float)output_attr->asymm.zero_point; - scaleOut = output_attr->asymm.scale; - } + input_ZP = input_attr->zero_point; + scaleIn = input_attr->scale; + weight_ZP = weights_attr->zero_point; + scaleWights = weights_attr->scale; + output_ZP = (float)output_attr->zero_point; + scaleOut = output_attr->scale; scaleOut = (scaleIn * scaleWights) / scaleOut; input_height = (int32_t)(in_shape->data[1]); diff --git a/src/tim/vx/internal/src/kernel/evis/crop_and_resize_evis.c b/src/tim/vx/internal/src/kernel/evis/crop_and_resize_evis.c new file mode 100644 index 0000000..012c040 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/crop_and_resize_evis.c @@ -0,0 +1,540 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_dtype_util.h" + +__BEGIN_DECLS + +typedef enum _crop_and_resize_type_e +{ + nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR, + bilinear = VSI_NN_INTERPOLATION_BILINEAR, +}crop_and_resize_type_e; + +#define _CROP_AND_RESIZE_KERNEL_SOURCE_NAME "crop_and_resize_" + +// Add kernel hashtable here +#define CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \ + (( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8) | (RESIZE_METHOD)) +#define CROP_AND_RESIZE_KERNEL( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \ + { CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ), \ + CVIVANTE_NAMESPACE("evis.crop_and_resize_"#RESIZE_METHOD"_"#IN_DTYPE"to"#OUT_DTYPE), \ + _CROP_AND_RESIZE_KERNEL_SOURCE_NAME#RESIZE_METHOD } + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _crop_and_resize_kernel_map[] = +{ + // Register kernel here + CROP_AND_RESIZE_KERNEL( U8, U8, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( U8, F16, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( F16, F16, nearest_neighbor), + CROP_AND_RESIZE_KERNEL( F16, U8, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( F16, I8, nearest_neighbor), + CROP_AND_RESIZE_KERNEL( I8, I8, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( I8, F16, nearest_neighbor), + CROP_AND_RESIZE_KERNEL( I16, I16, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( I16, F16, nearest_neighbor), + + CROP_AND_RESIZE_KERNEL( U8, U8, bilinear), + CROP_AND_RESIZE_KERNEL( U8, F16, bilinear), + CROP_AND_RESIZE_KERNEL( F16, F16, bilinear), + CROP_AND_RESIZE_KERNEL( F16, U8, bilinear), + CROP_AND_RESIZE_KERNEL( F16, I8, bilinear), + CROP_AND_RESIZE_KERNEL( I8, I8, bilinear), + CROP_AND_RESIZE_KERNEL( I8, F16, bilinear), + CROP_AND_RESIZE_KERNEL( I16, I16, bilinear), + CROP_AND_RESIZE_KERNEL( I16, F16, bilinear), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _crop_and_resize_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CROP_AND_RESIZE_PARAM_NUM _cnt_of_array( _crop_and_resize_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_crop_and_resize_nearest_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + }; + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + int32_t crop_width = 0; + int32_t crop_height = 0; + int32_t image_width = 0; + int32_t image_height = 0; + int32_t batch_out = 0; + float width_scale = 0; + float height_scale = 0; + float src0ZP = 0; + float src0Scale = 1; + float dstZP = 0; + float dstScale = 1; + float inOutScale = 0; + float inOutTile = 0; + + 
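+    /* The requantization below folds the input and output affine parameters
+     * into one multiply-add: inOutScale = src0Scale / dstScale and
+     * inOutTile = dstZP - inOutScale * src0ZP, so q_out = q_in * inOutScale + inOutTile.
+     * Worked example with hypothetical params src0Scale = 0.5, src0ZP = 128,
+     * dstScale = 0.25, dstZP = 0: inOutScale = 2, inOutTile = -256, and
+     * q_in = 130 (real value 1.0) maps to q_out = 4 (real value 1.0).
+     * width_scale / height_scale map a crop pixel index to source coordinates
+     * as (image_dim - 1) / (crop_dim - 1) when crop_dim > 1, else 0. */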
VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &batch_out); + CHECK_STATUS_FAIL_GOTO(status, final ); + + src0Scale = attr[0]->scale; + src0ZP = (float)attr[0]->zero_point; + + dstScale = attr[1]->scale; + dstZP = (float)attr[1]->zero_point; + + inOutScale = src0Scale / dstScale; + inOutTile = dstZP - inOutScale * src0ZP; + + image_width = (int32_t)(attr[0]->shape->data[0]); + image_height = (int32_t)(attr[0]->shape->data[1]); + crop_width = (int32_t)(attr[1]->shape->data[0]); + crop_height = (int32_t)(attr[1]->shape->data[1]); + + width_scale = (crop_width > 1) ? (float)(image_width - 1) / (crop_width -1) : 0; + height_scale = (crop_height > 1) ? (float)(image_height - 1) / (crop_height -1) : 0; + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((crop_width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 8); + gpu_param.global_size[1] = (crop_height + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = (batch_out + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + + CHECK_STATUS_FAIL_GOTO(status, final); + { + gpu_dp_inst_t uniExtract8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertFstToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniConvertSecToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertFstToFp32_4x4", &uniConvertFstToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertSecToFp32_4x4", &uniConvertSecToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Bit_2x8", &uniExtract8Bit_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", 
&inOutScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "inOutTile", &inOutTile ); + status |= vsi_nn_kernel_gpu_add_param( node, "width_scale", &width_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, "height_scale", &height_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, "image_width", &image_width ); + status |= vsi_nn_kernel_gpu_add_param( node, "image_height", &image_height ); + CHECK_STATUS_FAIL_GOTO(status, final); + } + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _crop_and_resize_nearest_initializer() */ + +DEF_KERNEL_INITIALIZER(_crop_and_resize_bilinear_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + }; + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + int32_t crop_width = 0; + int32_t crop_height = 0; + int32_t image_width = 0; + int32_t image_height = 0; + int32_t batch_out = 0; + float width_scale = 0; + float height_scale = 0; + float src0ZP = 0; + float src0Scale = 1; + float dstZP = 0; + float dstScale = 1; + float inOutScale = 0; + float inOutTile = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &batch_out); + CHECK_STATUS_FAIL_GOTO(status, final ); + + src0Scale = attr[0]->scale; + src0ZP = (float)attr[0]->zero_point; + + dstScale = attr[1]->scale; + dstZP = (float)attr[1]->zero_point; + + inOutScale = src0Scale / dstScale; + inOutTile = dstZP - inOutScale * src0ZP; + + image_width = (int32_t)(attr[0]->shape->data[0]); + image_height = (int32_t)(attr[0]->shape->data[1]); + crop_width = (int32_t)(attr[1]->shape->data[0]); + crop_height = (int32_t)(attr[1]->shape->data[1]); + + width_scale = (crop_width > 1) ? (float)(image_width - 1) / (crop_width -1) : 0; + height_scale = (crop_height > 1) ? 
(float)(image_height - 1) / (crop_height -1) : 0; + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((crop_width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (crop_height + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = (batch_out + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + + CHECK_STATUS_FAIL_GOTO(status, final); + { + gpu_dp_inst_t uniExtract8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniRightToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00030001, 0x00070005, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniLeftToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, + 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, "uniRightToFp32_4x4", &uniRightToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniLeftToFp32_4x4", &uniLeftToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Bit_2x8", &uniExtract8Bit_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "inOutTile", &inOutTile ); + status |= vsi_nn_kernel_gpu_add_param( node, "width_scale", &width_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, "height_scale", &height_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, "image_width", &image_width ); + status |= vsi_nn_kernel_gpu_add_param( node, "image_height", &image_height ); + CHECK_STATUS_FAIL_GOTO(status, final); + } + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _crop_and_resize_bilinear_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t resize_method + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = 
_crop_and_resize_kernel_map; + size_t kernel_map_size = _cnt_of_array( _crop_and_resize_kernel_map ); + vx_param_description_t * param_def = _crop_and_resize_kernel_param_def; + vx_kernel_initialize_f initializer = _crop_and_resize_nearest_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (resize_method == bilinear) + { + initializer = _crop_and_resize_bilinear_initializer; + } + key = CROP_AND_RESIZE_HASH_KEY( in_dtype, out_dtype, resize_method ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _crop_and_resize_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CROP_AND_RESIZE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + uint32_t ori_depth = (uint32_t)inputs[0]->attr.size[2]; + uint32_t ori_batchout = (uint32_t)outputs[0]->attr.size[3]; + float extrapolation_value = vsi_nn_kernel_param_get_float32( params, "extrapolation_value" ); + int32_t resize_method = vsi_nn_kernel_param_get_int32( params, "resize_method" ); + + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + shapes[0][0] = inputs[0]->attr.size[0]; + shapes[0][1] = inputs[0]->attr.size[1]; + shapes[0][2] = inputs[0]->attr.size[2] * inputs[0]->attr.size[3]; + + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[1][1] = outputs[0]->attr.size[1]; + shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3]; + + rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], 3 ); + rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], 3 ); + + if (rs_input == NULL || rs_output == NULL) + { + goto final; + } + + status = _query_kernel( kernel, inputs, outputs, resize_method ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + node_params[0] = rs_input; + node_params[1] = (vsi_nn_kernel_node_param_t)(inputs[1]->t); + node_params[2] = (vsi_nn_kernel_node_param_t)(inputs[2]->t); + node_params[3] = rs_output; + node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &ori_depth ); + node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &ori_batchout ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _CROP_AND_RESIZE_PARAM_NUM ); + CHECK_STATUS(status); + 
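+            /* Only node_params[4] and node_params[5] are scalar handles
+             * created here, so they are the ones released just below;
+             * rs_input / rs_output are released in the final: block. The
+             * border setup that follows makes out-of-image reads return
+             * extrapolation_value converted to the input tensor's dtype. */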
vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + } + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + vsi_nn_Float32ToDtype(extrapolation_value, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype); + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( crop_and_resize, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c index 9ed9c08..4660e89 100644 --- a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c @@ -204,39 +204,11 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &reverse); CHECK_STATUS_FAIL_GOTO(status, OnError ); - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - input_scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - input_scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input_scale = attr[0]->asymm.scale; - input_zp = attr[0]->asymm.zero_point; - } + input_scale = attr[0]->scale; + input_zp = attr[0]->zero_point; - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[1]->dfp.fl > 0) - { - output_scale = (float)((int64_t)1 << attr[1]->dfp.fl); - } - else - { - output_scale = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); - } - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - output_scale = 1.0f / attr[1]->asymm.scale; - output_zp = (float)attr[1]->asymm.zero_point; - } + output_scale = 1.0f / attr[1]->scale; + output_zp = (float)attr[1]->zero_point; in_out_scale = input_scale * output_scale; in_out_zp_scale = (float)in_out_scale * input_zp * (-1); diff --git a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c index 9d46462..b38d63c 100644 --- a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c @@ -22,7 +22,7 @@ * *****************************************************************************/ - +#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT) #include #include #include @@ -161,51 +161,10 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &block_size); CHECK_STATUS_FAIL_GOTO(status, OnError ); - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - src0ZP = attr[0]->asymm.zero_point; - src0Scale = attr[0]->asymm.scale; - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - src0ZP = 0; - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - src0Scale = 1; - src0ZP = 0; - } - - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - dstZP = attr[1]->asymm.zero_point; - dstScale = attr[1]->asymm.scale; - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[1]->dfp.fl > 0) - { - dstScale 
= (1.0f / (float)((int64_t)1 << attr[1]->dfp.fl)); - } - else - { - dstScale = (float)((int64_t)1 << -attr[1]->dfp.fl); - } - dstZP = 0; - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - dstScale = 1; - dstZP = 0; - } + src0ZP = attr[0]->zero_point; + src0Scale = attr[0]->scale; + dstZP = attr[1]->zero_point; + dstScale = attr[1]->scale; output_dims = (uint32_t)attr[1]->shape->size; output_width = (int32_t)(attr[1]->shape->data[0]); @@ -454,4 +413,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( depth2space_internal, _setup ) - +#endif diff --git a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c index a2f10ce..0e4e1fe 100644 --- a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c @@ -250,12 +250,12 @@ DEF_KERNEL_INITIALIZER(_depthwise_conv1d_initializer) gpu_param.global_size[1] = gpu_align_p2((output_shape->data[1] + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1], gpu_param.local_size[1]); - outputScale = input_attr->asymm.scale; + outputScale = input_attr->scale; - outputScale *= weight_attr->asymm.scale; - weightZP = weight_attr->asymm.zero_point; - outputScale /= output_attr->asymm.scale; - outputZP = (float)output_attr->asymm.zero_point + 0.5f; + outputScale *= weight_attr->scale; + weightZP = weight_attr->zero_point; + outputScale /= output_attr->scale; + outputZP = (float)output_attr->zero_point + 0.5f; #define _PACK_SELECT_KEY( kernel_size, dilation, evis_version ) \ ((uint64_t)kernel_size | ((uint64_t)dilation << 16) | ((uint64_t)evis_version << 32)) diff --git a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c index aa781c8..05115ae 100644 --- a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c @@ -135,17 +135,10 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer) status = vsi_nn_kernel_gpu_add_param( node, "logE", &logE); CHECK_STATUS_FAIL_GOTO(status, final ); - if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) - { - input0_ZP = input_attr->asymm.zero_point; - scaleIn0 = input_attr->asymm.scale; - } - - if ( VSI_NN_KERNEL_QUANT_ASYMM == input1_attr->quant ) - { - input1_ZP = input1_attr->asymm.zero_point; - scaleIn1 = input1_attr->asymm.scale; - } + input0_ZP = input_attr->zero_point; + scaleIn0 = input_attr->scale; + input1_ZP = input1_attr->zero_point; + scaleIn1 = input1_attr->scale; if ((F32 == input_attr->dtype) || (F32 == input1_attr->dtype)) { diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c index 5d383a1..be27bdd 100644 --- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -60,6 +60,7 @@ typedef enum UNARY_ATANH, UNARY_ACOSH, UNARY_INVERSE_SIGMOID, + UNARY_TAN, } unary_type_e; /* @@ -108,6 +109,7 @@ typedef enum #define ATANH_OPERATION atanh #define ACOSH_OPERATION acosh #define INVERSE_SIGMOID_OPERATION inverse_sigmoid +#define TAN_OPERATION tan #define ADD_UNARY_SH_KERNELS(name, source) \ TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, BF16, BF16, source##_3D) \ @@ -153,6 +155,7 @@ static const struct { ADD_UNARY_SH_KERNELS(ATAN, KERNEL_SOURCE1) ADD_UNARY_SH_KERNELS(ATANH, KERNEL_SOURCE1) ADD_UNARY_SH_KERNELS(ACOSH, KERNEL_SOURCE1) + 
ADD_UNARY_SH_KERNELS(TAN, KERNEL_SOURCE1) ADD_UNARY_SH_KERNELS(HSIGMOID, KERNEL_SOURCE0) ADD_UNARY_SH_KERNELS(MISH, KERNEL_SOURCE0) @@ -177,6 +180,7 @@ static const struct { #undef RCP_OPERATION #undef SIGN_OPERATION #undef SOFTSIGN_OPERATION +#undef TAN_OPERATION /* * Kernel params */ @@ -243,41 +247,10 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) } out_shape = attr[1]->shape; - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[0]->dfp.fl; - if (fl > 0) - { - inputScale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - inputScale = (float)((int64_t)1 << -fl); - } - } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputScale = attr[0]->asymm.scale; - inputTail = 0 - attr[0]->asymm.zero_point * inputScale; - } - - if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[1]->dfp.fl; - if (fl > 0) - { - outputScale = (float)((int64_t)1 << fl); - } - else - { - outputScale = (float)1.0f / (float) ((int64_t)1 << -fl); - } - } - else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outputScale = (float)1.0f / attr[1]->asymm.scale; - outputZP = (float)attr[1]->asymm.zero_point; - } + inputScale = attr[0]->scale; + inputTail = 0 - attr[0]->zero_point * inputScale; + outputScale = (float)1.0f / attr[1]->scale; + outputZP = (float)attr[1]->zero_point; #define _PACK_SELECT_KEY( TYPE, IN_TYPE, OUT_TYPE ) \ (( TYPE << 24) | ( IN_TYPE << 16) | ( OUT_TYPE << 8)) @@ -298,17 +271,23 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) switch( pack_key ) { +#if !(VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT) case _PACK_SELECT_KEY( UNARY_SIN, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_COS, BF16, BF16 ): +#endif +#if !(VX_ACTIVATION_EXP_VX_SUPPORT_EXT) case _PACK_SELECT_KEY( UNARY_EXP, BF16, BF16 ): +#endif case _PACK_SELECT_KEY( UNARY_LOG, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_SELU, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_NEG, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_HSIGMOID, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_MISH, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_ROUND, BF16, BF16 ): +#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT) case _PACK_SELECT_KEY( UNARY_GELU, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_HGELU, BF16, BF16 ): +#endif case _PACK_SELECT_KEY( UNARY_CELU, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_RCP, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_SIGN, BF16, BF16 ): @@ -317,6 +296,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) case _PACK_SELECT_KEY( UNARY_ATANH, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_ACOSH, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_INVERSE_SIGMOID, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_TAN, BF16, BF16 ): { gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ 0x11111111, // TCfg @@ -614,16 +594,22 @@ OnError: } \ REGISTER_BACKEND_EVIS( KERNEL_NAME, _##KERNEL_NAME##_setup ) +#if !(VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sin, UNARY_SIN ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( cos, UNARY_COS ) +#endif +#if !(VX_ACTIVATION_EXP_VX_SUPPORT_EXT) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( exp, UNARY_EXP ) +#endif REGISTER_ELTWISE_UNARY_BACKEND_EVIS( log, UNARY_LOG ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( neg, UNARY_NEG ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_sigmoid, UNARY_HSIGMOID ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( mish, UNARY_MISH ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( round, UNARY_ROUND ) +#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( gelu, UNARY_GELU ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_gelu, UNARY_HGELU ) +#endif 
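
The same simplification recurs across the initializers in this patch: the per-quantization-type branches (DFP fixed-point versus asymmetric affine) are dropped, and the kernels read unified scale / zero_point fields straight from the tensor attribute. The sketch below shows how such unified fields would presumably be resolved once at attribute-creation time, mirroring the math of the branches being deleted; the struct and helper names are illustrative stand-ins, not the actual ovxlib definitions.

/*
 * Minimal sketch (assumption): how a unified scale/zero_point pair could be
 * derived from the legacy DFP and asymmetric parameters. The real
 * vsi_nn_kernel_tensor_attr_t is more elaborate; this stand-in only captures
 * the conversion the removed branches used to perform inline.
 */
#include <stdint.h>

typedef enum { QUANT_NONE, QUANT_DFP, QUANT_ASYMM } quant_sketch_e;

typedef struct {
    quant_sketch_e quant;
    int32_t dfp_fl;       /* DFP fractional length                   */
    float   asymm_scale;  /* asymmetric-affine scale                 */
    int32_t asymm_zp;     /* asymmetric-affine zero point            */
    float   scale;        /* unified: real = (q - zero_point)*scale  */
    int32_t zero_point;   /* unified zero point                      */
} attr_sketch_t;

static void resolve_unified_quant( attr_sketch_t *a )
{
    switch ( a->quant )
    {
    case QUANT_DFP:
        /* DFP: scale = 2^-fl, zero point is always 0 */
        a->scale = ( a->dfp_fl >= 0 )
                 ? 1.0f / (float)( (int64_t)1 << a->dfp_fl )
                 : (float)( (int64_t)1 << -a->dfp_fl );
        a->zero_point = 0;
        break;
    case QUANT_ASYMM:
        a->scale      = a->asymm_scale;
        a->zero_point = a->asymm_zp;
        break;
    default:
        /* float / unquantized tensors */
        a->scale      = 1.0f;
        a->zero_point = 0;
        break;
    }
}

With the unified fields, dequantization is real = (q - zero_point) * scale, so the expressions in the simplified hunks follow directly: inputTail = -zero_point * scale on the input side, and outputScale = 1.0f / scale with outputZP = (float)zero_point on the output side.
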
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( selu, UNARY_SELU ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( celu, UNARY_CELU ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( rcp, UNARY_RCP ) @@ -633,5 +619,6 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( atan, UNARY_ATAN ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( atanh, UNARY_ATANH ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( acosh, UNARY_ACOSH ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( inverse_sigmoid, UNARY_INVERSE_SIGMOID ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( tan, UNARY_TAN ) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/evis/erf_evis.c b/src/tim/vx/internal/src/kernel/evis/erf_evis.c index ebc8ad8..22c8712 100644 --- a/src/tim/vx/internal/src/kernel/evis/erf_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/erf_evis.c @@ -145,41 +145,10 @@ DEF_KERNEL_INITIALIZER(_erf_initializer) out_shape = attr[1]->shape; - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[0]->dfp.fl; - if (fl > 0) - { - inputScale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - inputScale = (float)((int64_t)1 << -fl); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputScale = attr[0]->asymm.scale; - inputTail = 0 - attr[0]->asymm.zero_point * inputScale; - } - - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[1]->dfp.fl; - if (fl > 0) - { - outputScale = (float)((int64_t)1 << fl); - } - else - { - outputScale = (float)1.0f / (float) ((int64_t)1 << -fl); - } - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outputScale = (float)1.0f / attr[1]->asymm.scale; - outputZP = (float)attr[1]->asymm.zero_point; - } + inputScale = attr[0]->scale; + inputTail = 0 - (float)attr[0]->zero_point * inputScale; + outputScale = (float)1.0f / attr[1]->scale; + outputZP = (float)attr[1]->zero_point; #define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \ ( ( IN_TYPE << 16) | ( OUT_TYPE << 8)) diff --git a/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c b/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c index 86d4d58..d12998d 100644 --- a/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c @@ -129,9 +129,6 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; vsi_nn_kernel_dtype_e input0_dtype = F16; - int32_t input0_fl = 0; - int32_t input1_fl = 0; - int32_t output_fl = 0; float inScale0 = 1.0f; float inScale1 = 1.0f; float outScale = 1.0f; @@ -169,59 +166,12 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer) (output_shape->data[2] + gpu_param.global_scale[2] - 1) / gpu_param.global_scale[2] : 1; - if( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - input0_fl = input0_attr->dfp.fl; - if (input0_fl > 0) - { - inScale0 = 1.0f / (float) ((int64_t)1 << input0_fl); - } - else - { - inScale0 = (float)((int64_t)1 << -input0_fl); - } - } - else if( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inScale0 = input0_attr->asymm.scale; - in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point); - } - - if( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - input1_fl = input1_attr->dfp.fl; - if (input1_fl > 0) - { - inScale1 = 1.0f / (float) ((int64_t)1 << input1_fl); - } - else - { - inScale1 = (float)((int64_t)1 << -input1_fl); - } - } - else if( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inScale1 = input1_attr->asymm.scale; - in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point); - } - - if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - output_fl = 
output_attr->dfp.fl; - if (output_fl > 0) - { - outScale = (float) ((int64_t)1 << output_fl); - } - else - { - outScale = 1.0f / (float)((int64_t)1 << -output_fl); - } - } - else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outScale = 1.0f / output_attr->asymm.scale; - outZp = (float)(output_attr->asymm.zero_point); - } + inScale0 = input0_attr->scale; + in0Tail = 0 - inScale0 * ((float)input0_attr->zero_point); + inScale1 = input1_attr->scale; + in1Tail = 0 - inScale1 * ((float)input1_attr->zero_point); + outScale = 1.0f / output_attr->scale; + outZp = (float)(output_attr->zero_point); if (BF16 == input0_dtype) { diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index cf4411e..c61565c 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -22,7 +22,7 @@ * *****************************************************************************/ - +#if !(VX_TENSOR_GATHER_API_SUPPORT) #include #include #include @@ -202,6 +202,7 @@ static vx_param_description_t _gather_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; #define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def ) @@ -285,6 +286,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) int32_t indices_num = 1; uint32_t input_dims1 = 0; int32_t batch = 1; + int32_t is_array = 0; vx_uint32 i = 0; vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; vsi_size_array_t * input1_shape = NULL; @@ -308,40 +310,13 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num); CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &is_array); + CHECK_STATUS_FAIL_GOTO(status, OnError ); - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - src0Scale = attr[0]->asymm.scale; - src0ZP = attr[0]->asymm.zero_point; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - dstScale = 1.0f / attr[2]->asymm.scale; - dstZP = attr[2]->asymm.zero_point; - } + src0Scale = attr[0]->scale; + src0ZP = attr[0]->zero_point; + dstScale = 1.0f / attr[2]->scale; + dstZP = attr[2]->zero_point; input1_shape = attr[1]->shape; input_dims1 = (uint32_t)input1_shape->size; @@ -358,8 +333,16 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) } shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; - shaderParam.global_size[0] = gpu_align_p2((block_size + shaderParam.global_scale[0] - 1) - / shaderParam.global_scale[0], 4); + if (is_array) + { + shaderParam.global_size[0] = (block_size + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0]; + } + else + { + shaderParam.global_size[0] = gpu_align_p2((block_size + shaderParam.global_scale[0] - 1) 
+ / shaderParam.global_scale[0], 4); + } shaderParam.global_size[1] = indices_num; shaderParam.global_size[2] = block_num; @@ -508,39 +491,10 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num); CHECK_STATUS_FAIL_GOTO(status, OnError ); - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - src0Scale = attr[0]->asymm.scale; - src0ZP = attr[0]->asymm.zero_point; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - dstScale = 1.0f / attr[2]->asymm.scale; - dstZP = attr[2]->asymm.zero_point; - } + src0Scale = attr[0]->scale; + src0ZP = attr[0]->zero_point; + dstScale = 1.0f / attr[2]->scale; + dstZP = attr[2]->zero_point; input1_shape = attr[1]->shape; input_dims1 = (uint32_t)input1_shape->size; @@ -661,8 +615,11 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) { status |= vsi_nn_kernel_gpu_add_param(node, "batch", &batch); } - status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); - status |= vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder); + if (indices_num > GPU_TENSOR_MAX_WIDTH || block_num > GPU_TENSOR_MAX_WIDTH) + { + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder); + } CHECK_STATUS_FAIL_GOTO(status, OnError ); OnError: @@ -841,6 +798,7 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is_array ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _GATHER_PARAM_NUM ); vsi_nn_kernel_scalar_release( &tmp_params[3] ); vsi_nn_kernel_scalar_release( &tmp_params[4] ); @@ -859,3 +817,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( gather, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c index 91c8f17..1d829fd 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c @@ -290,39 +290,10 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size); CHECK_STATUS_FAIL_GOTO(status, OnError ); - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - src0Scale = attr[0]->asymm.scale; - src0ZP = attr[0]->asymm.zero_point; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - } - else if ( 
attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - dstScale = 1.0f / attr[2]->asymm.scale; - dstZP = attr[2]->asymm.zero_point; - } + src0Scale = attr[0]->scale; + src0ZP = attr[0]->zero_point; + dstScale = 1.0f / attr[2]->scale; + dstZP = attr[2]->zero_point; indices_num = (int32_t)(attr[1]->shape->data[1]); batch_num = (int32_t)(attr[1]->shape->size > 2 ? attr[1]->shape->data[2] : 1); diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c index 631cfd9..3cf2829 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c @@ -238,7 +238,7 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer) float tensorZP[4] = {0.0f, 0.0f, 0.0f, 0.0f}; uint32_t i = 0; uint32_t pack_key = 0; - vsi_size_array_t * output_shape = NULL; + vsi_size_array_t * output_shape = NULL; vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL, NULL, NULL, NULL }; VSI_UNREFERENCED(param_size); @@ -254,12 +254,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer) for (i = 0; i < 4; i++) { - if( attr[i]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[i]->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - tensorZP[i] = (float)attr[i]->asymm.zero_point; - tensorScale[i] = attr[i]->asymm.scale; - } + tensorZP[i] = (float)attr[i]->zero_point; + tensorScale[i] = attr[i]->scale; } tensorZP[0] = tensorScale[0] * tensorZP[0]; @@ -459,63 +455,31 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_cdnn_initializer) output_shape = attr[3]->shape; - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - input_scale = attr[0]->asymm.scale; - input_tail = 0 - input_scale * (float)attr[0]->asymm.zero_point; - } + input_scale = attr[0]->scale; + input_tail = 0 - input_scale * (float)attr[0]->zero_point; - if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - input_r_scale = attr[1]->asymm.scale; - input_r_tail = 0 - input_r_scale * (float)attr[1]->asymm.zero_point; - } + input_r_scale = attr[1]->scale; + input_r_tail = 0 - input_r_scale * (float)attr[1]->zero_point; - if( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - recur_r_scale = attr[2]->asymm.scale; - recur_r_tail = 0 - recur_r_scale * (float)attr[2]->asymm.zero_point; - } + recur_r_scale = attr[2]->scale; + recur_r_tail = 0 - recur_r_scale * (float)attr[2]->zero_point; - if( attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[3]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - output_scale = 1.0f / attr[3]->asymm.scale; - output_zp = (float)attr[3]->asymm.zero_point; - } + output_scale = 1.0f / attr[3]->scale; + output_zp = (float)attr[3]->zero_point; if ( param_size == _GRUCELL_CDNN_SEP_ACTIVATION_PARAM_NUM ) { - if( attr[4]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[4]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - input_z_scale = attr[4]->asymm.scale; - input_z_tail = 0 - input_z_scale * (float)attr[4]->asymm.zero_point; - } + input_z_scale = attr[4]->scale; + input_z_tail = 0 - input_z_scale * (float)attr[4]->zero_point; - if( attr[5]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[5]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - recur_z_scale = attr[5]->asymm.scale; - recur_z_tail = 0 - recur_z_scale * (float)attr[5]->asymm.zero_point; - } + recur_z_scale = attr[5]->scale; + recur_z_tail = 0 - recur_z_scale * (float)attr[5]->zero_point; - if( attr[6]->quant == VSI_NN_KERNEL_QUANT_ASYMM - 
|| attr[6]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - input_c_scale = attr[6]->asymm.scale; - input_c_tail = 0 - input_c_scale * (float)attr[6]->asymm.zero_point; - } + input_c_scale = attr[6]->scale; + input_c_tail = 0 - input_c_scale * (float)attr[6]->zero_point; - if( attr[5]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[5]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - recur_c_scale = attr[7]->asymm.scale; - recur_c_tail = 0 - recur_c_scale * (float)attr[7]->asymm.zero_point; - } + recur_c_scale = attr[7]->scale; + recur_c_tail = 0 - recur_c_scale * (float)attr[7]->zero_point; } if (layer_out == 1 || layer_out == 2) diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c index 63360b4..b4b7b61 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c @@ -119,6 +119,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) float hstate_in_tail = 0; float output_scale = 1.0f; float output_zp = 0; + float output_scale1 = 1.0f; + float output_zp1 = 0; uint32_t i = 0; uint32_t pack_key = 0; vsi_nn_kernel_tensor_attr_t* input_attr[GRUCELL_ACT_Z_H_IN_CNT] = {NULL}; @@ -142,33 +144,14 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) output_attr[1] = vsi_nn_kernel_tensor_attr_create( hstate_out ); CHECK_PTR_FAIL_GOTO( output_attr[1], "Create tensor attr buffer fail.", final ); - if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant ) - { - int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; - if (srcFixPointPos >= 0) - hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); - else if (srcFixPointPos < 0) - hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos); - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant ) - { - hstate_in_scale = input_attr[0]->asymm.scale; - hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale; - } + hstate_in_scale = input_attr[0]->scale; + hstate_in_tail = -(float)input_attr[0]->zero_point * hstate_in_scale; - if ( VSI_NN_KERNEL_QUANT_DFP == output_attr[0]->quant ) - { - int8_t dstFixPointPos = (int8_t)output_attr[0]->dfp.fl; - if (dstFixPointPos >= 0) - output_scale *= (vx_float32)((int64_t)1 << dstFixPointPos); - else if (dstFixPointPos < 0) - output_scale *= 1.0f / (vx_float32) ((int64_t)1 << - dstFixPointPos); - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant ) - { - output_scale = 1.0f / output_attr[0]->asymm.scale; - output_zp = (float)output_attr[0]->asymm.zero_point; - } + output_scale = 1.0f / output_attr[0]->scale; + output_zp = (float)output_attr[0]->zero_point; + + output_scale1 = 1.0f / output_attr[1]->scale; + output_zp1 = (float)output_attr[1]->zero_point; pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype); @@ -290,6 +273,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_tail", &hstate_in_tail); status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale1", &output_scale1); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp1", &output_zp1); CHECK_STATUS_FAIL_GOTO(status, final ); } break; diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c 
b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c index e3a2899..a4e885a 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c @@ -132,19 +132,8 @@ DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer) output_attr[0] = vsi_nn_kernel_tensor_attr_create( output ); CHECK_PTR_FAIL_GOTO( output_attr[0], "Create tensor attr buffer fail.", final ); - if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant ) - { - int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; - if (srcFixPointPos >= 0) - hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); - else if (srcFixPointPos < 0) - hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos); - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant ) - { - hstate_in_scale = input_attr[0]->asymm.scale; - hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale; - } + hstate_in_scale = input_attr[0]->scale; + hstate_in_tail = 0 - (float)input_attr[0]->zero_point * hstate_in_scale; pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype); diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c index f53a56a..e281d9a 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c @@ -47,6 +47,7 @@ typedef enum _grucell_nn_activation_type_e SIGMOID = VSI_NN_ACT_SIGMOID, HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, TANH = VSI_NN_ACT_TANH, + RELU = VSI_NN_ACT_RELU, }grucell_nn_activation_type_e; #define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation" @@ -80,6 +81,11 @@ static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] = PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, SIGMOID ), PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, SIGMOID ), PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID, SIGMOID ), + PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, RELU ), + PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, RELU ), + PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, RELU ), + PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, RELU ), + PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID, RELU ), }; @@ -148,33 +154,11 @@ DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer) output_attr[1] = vsi_nn_kernel_tensor_attr_create( hstate_out ); CHECK_PTR_FAIL_GOTO( output_attr[1], "Create tensor attr buffer fail.", final ); - if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant ) - { - int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; - if (srcFixPointPos >= 0) - hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); - else if (srcFixPointPos < 0) - hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos); - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant ) - { - hstate_in_scale = input_attr[0]->asymm.scale; - hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale; - } + hstate_in_scale = input_attr[0]->scale; + hstate_in_tail = -(float)input_attr[0]->zero_point * hstate_in_scale; - if ( VSI_NN_KERNEL_QUANT_DFP == output_attr[0]->quant ) - { - int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; - if (srcFixPointPos >= 0) - output_scale *= (vx_float32)((int64_t)1 << srcFixPointPos); - else if (srcFixPointPos < 0) - output_scale *= 1.0f / (vx_float32) ((int64_t)1 << 
-srcFixPointPos); - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant ) - { - output_scale = 1.0f / output_attr[0]->asymm.scale; - output_zp = (float)output_attr[0]->asymm.zero_point; - } + output_scale = 1.0f / output_attr[0]->scale; + output_zp = (float)output_attr[0]->zero_point; pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype); diff --git a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c index 068257c..f55891f 100644 --- a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c @@ -127,10 +127,8 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) vsi_size_array_t * output_shape = NULL; vsi_nn_kernel_dtype_e input_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = F16; - int32_t input_fl = 0; int32_t inputZP = 0; float inputScale = 1.0f; - int32_t output_fl = 0; int32_t outputZP = 0; float outputScale = 1.0f; float r_inputScale = 1.0f; @@ -153,41 +151,11 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) input_dtype = input_attr->dtype; output_dtype = output_attr->dtype; - if ( VSI_NN_KERNEL_QUANT_DFP == input_attr->quant ) - { - input_fl = input_attr->dfp.fl; - if (input_fl >= 0) - { - inputScale = 1.0f / (float) ((int64_t)1 << input_fl); - } - else - { - inputScale = (float) ((int64_t)1 << -input_fl); - } - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) - { - inputZP = input_attr->asymm.zero_point; - inputScale = input_attr->asymm.scale; - } + inputZP = input_attr->zero_point; + inputScale = input_attr->scale; - if ( VSI_NN_KERNEL_QUANT_DFP == output_attr->quant ) - { - output_fl = output_attr->dfp.fl; - if (output_fl >= 0) - { - outputScale = (float) ((int64_t)1 << output_fl); - } - else - { - outputScale = 1.0f / (float) ((int64_t)1 << -output_fl); - } - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) - { - outputZP = output_attr->asymm.zero_point; - outputScale = 1.0f / output_attr->asymm.scale; - } + outputZP = output_attr->zero_point; + outputScale = 1.0f / output_attr->scale; e2InScale = inputScale * inputScale; r_inputScale = 1.0f / inputScale; diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c index 0a477c5..5ecb4b7 100644 --- a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) #include #include #include @@ -42,7 +43,11 @@ __BEGIN_DECLS #define SOURCE_AXIS0_1 "layer_normalization_1" #define SOURCE_AXIS0_2 "layer_normalization_2" #define SOURCE_AXIS0_3 "layer_normalization_3" -#define SOURCE_AXIS01 "layer_normalization_axis01" +#define SOURCE_AXIS01_SUM "layer_normalization_axis01_sum" +#define SOURCE_AXIS01_0 "layer_normalization_axis01_0" +#define SOURCE_AXIS01_1 "layer_normalization_axis01_1" +#define SOURCE_AXIS01_2 "layer_normalization_axis01_2" +#define SOURCE_AXIS01_3 "layer_normalization_axis01_3" #define HASH_LAYERNORM_SH_KERNEL_NAME(SRC0_TYPE, SCALE_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.layer_norm_axis0_"#SRC0_TYPE"_"#SCALE_TYPE"to"#DST_TYPE) @@ -88,15 +93,15 @@ __BEGIN_DECLS #define HASH_LN_AXIS01_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ 
CVIVANTE_NAMESPACE("evis.layernorm_axis01_"#SRC0_TYPE"_"#SRC1_TYPE"to"#DST_TYPE) -#define LN_AXIS01_SUMS_KERNELS(IN0_TYPE, OUT_TYPE) \ +#define LN_AXIS01_SUMS_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ { HASH_LAYERNORM_KEY(IN0_TYPE, U4, OUT_TYPE, 0), \ HASH_LN_AXIS01_SUMS_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE_AXIS01 }, + SOURCE }, -#define LAYERNORM_AXIS01_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ +#define LAYERNORM_AXIS01_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ { HASH_LAYERNORM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ HASH_LN_AXIS01_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ - SOURCE_AXIS01 }, + SOURCE }, typedef struct { @@ -159,32 +164,32 @@ static const _kernel_map_type _layernorm_kernel_map[] = static const _kernel_map_type _layernorm_axis01_kernel_map[] = { // Register kernel here - LN_AXIS01_SUMS_KERNELS( I8, F32 ) - LN_AXIS01_SUMS_KERNELS( U8, F32 ) - LN_AXIS01_SUMS_KERNELS( F16, F32 ) - LN_AXIS01_SUMS_KERNELS( I16, F32 ) + LN_AXIS01_SUMS_KERNELS( I8, F32, SOURCE_AXIS01_SUM ) + LN_AXIS01_SUMS_KERNELS( U8, F32, SOURCE_AXIS01_SUM ) + LN_AXIS01_SUMS_KERNELS( F16, F32, SOURCE_AXIS01_SUM ) + LN_AXIS01_SUMS_KERNELS( I16, F32, SOURCE_AXIS01_SUM ) - LAYERNORM_AXIS01_KERNELS( U8, F16, U8 ) - LAYERNORM_AXIS01_KERNELS( U8, F16, F16 ) - LAYERNORM_AXIS01_KERNELS( I8, F16, I8 ) - LAYERNORM_AXIS01_KERNELS( I8, F16, F16 ) - LAYERNORM_AXIS01_KERNELS( F16, F16, F16 ) - LAYERNORM_AXIS01_KERNELS( F16, F16, I16 ) - LAYERNORM_AXIS01_KERNELS( F16, F16, I8 ) - LAYERNORM_AXIS01_KERNELS( F16, F16, U8 ) - LAYERNORM_AXIS01_KERNELS( I16, F16, I16 ) - LAYERNORM_AXIS01_KERNELS( I16, F16, F16 ) + LAYERNORM_AXIS01_KERNELS( U8, F16, U8, SOURCE_AXIS01_0 ) + LAYERNORM_AXIS01_KERNELS( U8, F16, F16, SOURCE_AXIS01_0 ) + LAYERNORM_AXIS01_KERNELS( I8, F16, I8, SOURCE_AXIS01_1 ) + LAYERNORM_AXIS01_KERNELS( I8, F16, F16, SOURCE_AXIS01_1 ) + LAYERNORM_AXIS01_KERNELS( F16, F16, F16, SOURCE_AXIS01_2 ) + LAYERNORM_AXIS01_KERNELS( F16, F16, I16, SOURCE_AXIS01_2 ) + LAYERNORM_AXIS01_KERNELS( F16, F16, I8, SOURCE_AXIS01_2 ) + LAYERNORM_AXIS01_KERNELS( F16, F16, U8, SOURCE_AXIS01_2 ) + LAYERNORM_AXIS01_KERNELS( I16, F16, I16, SOURCE_AXIS01_3 ) + LAYERNORM_AXIS01_KERNELS( I16, F16, F16, SOURCE_AXIS01_3 ) - LAYERNORM_AXIS01_KERNELS( U8, F32, U8 ) - LAYERNORM_AXIS01_KERNELS( U8, F32, F16 ) - LAYERNORM_AXIS01_KERNELS( I8, F32, I8 ) - LAYERNORM_AXIS01_KERNELS( I8, F32, F16 ) - LAYERNORM_AXIS01_KERNELS( F16, F32, F16 ) - LAYERNORM_AXIS01_KERNELS( F16, F32, I16 ) - LAYERNORM_AXIS01_KERNELS( F16, F32, I8 ) - LAYERNORM_AXIS01_KERNELS( F16, F32, U8 ) - LAYERNORM_AXIS01_KERNELS( I16, F32, I16 ) - LAYERNORM_AXIS01_KERNELS( I16, F32, F16 ) + LAYERNORM_AXIS01_KERNELS( U8, F32, U8, SOURCE_AXIS01_0 ) + LAYERNORM_AXIS01_KERNELS( U8, F32, F16, SOURCE_AXIS01_0 ) + LAYERNORM_AXIS01_KERNELS( I8, F32, I8, SOURCE_AXIS01_1 ) + LAYERNORM_AXIS01_KERNELS( I8, F32, F16, SOURCE_AXIS01_1 ) + LAYERNORM_AXIS01_KERNELS( F16, F32, F16, SOURCE_AXIS01_2 ) + LAYERNORM_AXIS01_KERNELS( F16, F32, I16, SOURCE_AXIS01_2 ) + LAYERNORM_AXIS01_KERNELS( F16, F32, I8, SOURCE_AXIS01_2 ) + LAYERNORM_AXIS01_KERNELS( F16, F32, U8, SOURCE_AXIS01_2 ) + LAYERNORM_AXIS01_KERNELS( I16, F32, I16, SOURCE_AXIS01_3 ) + LAYERNORM_AXIS01_KERNELS( I16, F32, F16, SOURCE_AXIS01_3 ) }; @@ -1165,3 +1170,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( layer_norm, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c index 4e7b8a0..37fddea 100644 --- 
a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_LOGSOFTMAX_VX_SUPPORT) #include #include #include @@ -34,15 +35,21 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" __BEGIN_DECLS #define HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + #define HASH_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_suffix) \ "log_softmax_axis"#_suffix + #define HASH_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(_suffix) \ + "log_softmax_exceed_axis"#_suffix + + #define HASH_LOG_SOFTMAX_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, _suffix) \ { HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ CVIVANTE_NAMESPACE("evis.log_softmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ @@ -53,11 +60,18 @@ __BEGIN_DECLS CVIVANTE_NAMESPACE("evis.log_softmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ HASH_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_suffix) }, -static const struct { +#define HASH_LOG_SOFTMAX_EXCEED_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, _suffix) \ + { HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.log_softmax_exceed_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ + HASH_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(_suffix) }, + +typedef struct { uint32_t key; char* function_name; const char* source_name; - } _log_softmax_evis_kernel_map[] = + } _kernel_map_type; + +static const _kernel_map_type _log_softmax_evis_kernel_map[] = { HASH_LOG_SOFTMAX_KERNELS(0, F16, F16, 0) HASH_LOG_SOFTMAX_KERNELS(0, F16, I16, 0) @@ -126,6 +140,49 @@ static const struct { }; +static const _kernel_map_type _log_softmax_exceed_evis_kernel_map[] = +{ + + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, F16, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, I16, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, U8, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, I8, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I16, I16, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I16, F16, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, BF16, 0_BF16) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, F32, 0_BF16) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, F16, 0_BF16) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, U8, U8, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, U8, F16, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I8, I8, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I8, F16, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, F16, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, I16, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, U8, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, I8, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I16, I16, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I16, F16, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, BF16, 1_BF16) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, F32, 1_BF16) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, F16, 1_BF16) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, U8, U8, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, U8, F16, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I8, I8, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I8, F16, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, F16, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, I16, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, U8, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, I8, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I16, I16, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I16, F16, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, BF16, 
BF16, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, U8, U8, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, U8, F16, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I8, I8, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I8, F16, 2) + +}; + static vx_param_description_t kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -133,7 +190,9 @@ static vx_param_description_t kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; -#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) + + +#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) #define SCALAR_INPUT_AXIS (2) #define SCALAR_INPUT_BETA (3) @@ -157,7 +216,7 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer) float beta = 0; float input_scale = 0; float output_scale = 0; - int32_t outputZP = 0; + float outputZP = 0; uint32_t inputWidth = 0; uint32_t inputWidthRemain4 = 0; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; @@ -385,62 +444,25 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer) } } + outputZP = (float)attr[1]->zero_point; + output_scale = 1.0f / (float)(attr[1]->scale); + if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) { - int32_t fl = attr[1]->dfp.fl; - - if (fl > 0) - { - output_scale = (float)((int64_t)1 << fl); - } - else - { - output_scale = (float)1.0f / (float) ((int64_t)1 << -fl); - } - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &output_scale ); CHECK_STATUS_FAIL_GOTO(status, final ); } else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - float output_offset_asymmetric = 0; - outputZP = attr[1]->asymm.zero_point; - output_scale = 1.0f / (float)(attr[1]->asymm.scale); - output_offset_asymmetric = (float)outputZP; - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &output_scale ); status |= vsi_nn_kernel_gpu_add_param( node, - "output_offset_asymmetric", &output_offset_asymmetric ); + "output_offset_asymmetric", &outputZP ); CHECK_STATUS_FAIL_GOTO(status, final ); } - else - { - output_scale = 1; - outputZP = 0; - } - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[0]->dfp.fl; - if (fl > 0) - { - input_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input_scale = (float)((int64_t)1 << -fl); - } - } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input_scale = attr[0]->asymm.scale; - } - else - { - input_scale = 1.0f; - } + input_scale = attr[0]->scale; scaleLogE = scaleLogE * input_scale; beta = beta * input_scale; @@ -471,6 +493,296 @@ final: return status; } /* _log_softmax_initializer() */ +DEF_KERNEL_INITIALIZER(_log_softmax_exceed_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + int32_t axis = 0; + float beta = 0; + float input_scale = 0; + float output_scale = 0; + float outputZP = 0; + uint32_t inputWidth = 0; + uint32_t inputWidthRemain4 = 0; + int32_t width = 0; + int32_t height = 0; + int32_t depth = 0; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; + vsi_size_array_t * output_shape = NULL; + float logE = (float)(log10(exp(1.0f)) / log10(2.0f)); + float rlogE = (float)(log10(2.0f) / log10(exp(1.0f))); + float scaleLogE = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = 
vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &beta); + CHECK_STATUS_FAIL_GOTO(status, final ); + + scaleLogE = logE * beta; + + output_shape = attr[1]->shape; + width = (int32_t)output_shape->data[0]; + height = (int32_t)output_shape->data[1]; + depth = output_shape->size > 2 ? (int32_t)output_shape->data[2] : 1; + gpu_param.dim = 2; + switch (axis) + { + case 0: + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = 1; + gpu_param.global_size[1] = depth; + break; + case 1: + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = + gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = 1; + break; + default: + break; + } + + { + gpu_dp_inst_t uniGetSubData0to3_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniGetSubData4to7_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackMaxData_2x8 = {{ + 0x00000111, // TCfg + 0x00000000, // ASelt + 0x00050300, 0x00000000, // ABin + 0x00000222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00004400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf4_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and 
PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGetSubLoData_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGetSubHiData_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00550044, 0x00770066, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + switch( axis ) + { + case 0: + { + inputWidth = (uint32_t)(output_shape->data[axis] / 4 * 4); + inputWidthRemain4 = (uint32_t)(output_shape->data[axis] % 4); + + status = vsi_nn_kernel_gpu_add_param( node, + "inputWidth", &inputWidth ); + status |= vsi_nn_kernel_gpu_add_param( node, + "inputWidthRemain4", &inputWidthRemain4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniPackMaxData_2x8", &uniPackMaxData_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "axisSize", &width ); + status |= vsi_nn_kernel_gpu_add_param( node, "height", &height); + if (attr[0]->dtype == BF16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractHalf4_4x4", &uniExtractHalf4_4x4 ); + } + else + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGetSubData0to3_4x4", &uniGetSubData0to3_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGetSubData4to7_4x4", &uniGetSubData4to7_4x4 ); + } + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case 1: + { + if (attr[0]->dtype == BF16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + } + else + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGetSubLoData_4x4", &uniGetSubLoData_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + 
"uniGetSubHiData_4x4", &uniGetSubHiData_4x4 ); + } + status |= vsi_nn_kernel_gpu_add_param( node, "axisSize", &height ); + status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + } + + outputZP = (float)attr[1]->zero_point; + output_scale = 1.0f / attr[1]->scale; + + if (attr[0]->dtype != BF16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "outputScale", &output_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "output_offset_asymmetric", &outputZP ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + input_scale = attr[0]->scale; + + scaleLogE = scaleLogE * input_scale; + beta = beta * input_scale; + + status |= vsi_nn_kernel_gpu_add_param( node, + "rlogE", &rlogE ); + status |= vsi_nn_kernel_gpu_add_param( node, + "betaValue", &beta ); + status |= vsi_nn_kernel_gpu_add_param( node, + "scaleLogE", &scaleLogE ); + + status |= vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + } + + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + } + + return status; + +} + static vsi_status _query_kernel ( vsi_nn_tensor_t* const* const inputs, @@ -513,7 +825,51 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ -static vsi_nn_kernel_node_t _setup +static vsi_status _query_kernel_exceed + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + int32_t axis, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + size_t i; + + input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_LOG_SOFTMAX_HASH_KEY( axis, input_dtype, output_dtype, 0 ); + + for( i = 0; i < _cnt_of_array(_log_softmax_exceed_evis_kernel_map); i ++ ) + { + if( _log_softmax_exceed_evis_kernel_map[i].key == key ) + { + break; + } + } + + if( i < _cnt_of_array(_log_softmax_exceed_evis_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _log_softmax_exceed_evis_kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _log_softmax_exceed_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _log_softmax_exceed_evis_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _log_softmax_exceed_evis_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} + +static vsi_nn_kernel_node_t _setup_not_exceed ( vsi_nn_graph_t * graph, vsi_nn_tensor_t ** inputs, @@ -528,7 +884,13 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_EVIS_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank_in = 0; int32_t axis = 0; + int32_t new_axis = 0; + vsi_bool ret = vx_false_e; + uint32_t i = 0; float beta = 1.0f; VSI_UNREFERENCED(input_num); @@ -537,15 +899,31 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); beta = vsi_nn_kernel_param_get_float32(params, "beta"); - if( !vsi_nn_kernel_gpu_check_shape( 
inputs[0]->attr.size, - inputs[0]->attr.dim_num ) - || axis > 2) + ret = vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rank_in, &new_axis); + + if (ret) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[0], rank_in ); + } + else { return NULL; } - image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); - status = _query_kernel( inputs, outputs, axis, image_2d, kernel ); + if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size, + reshape_tensors[0]->attr.dim_num ) + || new_axis > 2) + { + return NULL; + } + + image_2d = (reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1); + status = _query_kernel( inputs, outputs, new_axis, image_2d, kernel ); if( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -553,9 +931,9 @@ static vsi_nn_kernel_node_t _setup { /* Pass parameters to node. */ vsi_nn_kernel_node_pack_io( node_params, _EVIS_PARAM_NUM, - inputs, 1, outputs, 1 ); + reshape_tensors, 1, &reshape_tensors[1], 1 ); node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( - graph, I32, &axis ); + graph, I32, &new_axis ); node_params[SCALAR_INPUT_BETA] = vsi_nn_kernel_scalar_create( graph, F32, &beta ); @@ -565,10 +943,132 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_BETA] ); } } + + for (i = 0; i < 2; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + return node; } /* _setup() */ +static vsi_nn_kernel_node_t _setup_exceed + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_EVIS_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank_in = 0; + int32_t axis = 0; + int32_t new_axis = 0; + vsi_bool ret = vx_false_e; + uint32_t i = 0; + float beta = 1.0f; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + beta = vsi_nn_kernel_param_get_float32(params, "beta"); + + ret = vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rank_in, &new_axis); + + if (ret) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[0], rank_in ); + } + else + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size, + reshape_tensors[0]->attr.dim_num ) + || new_axis > 1) + { + return NULL; + } + + status = _query_kernel_exceed(inputs, outputs, new_axis, kernel); + if( VSI_SUCCESS != status) + { + goto final; + } + + node = vsi_nn_kernel_create_node( graph, kernel ); + CHECK_PTR_FAIL_GOTO( node, "Create kernel fail.", final ); + if (node) + { + vsi_nn_kernel_node_pack_io(node_params, _EVIS_PARAM_NUM, + reshape_tensors, + input_num, + &reshape_tensors[1], + output_num); + node_params[2] = vsi_nn_kernel_scalar_create(graph, I32, &new_axis ); + node_params[3] = vsi_nn_kernel_scalar_create(graph, F32, &beta ); + + status = vsi_nn_kernel_node_pass_param( + node, node_params, 
_EVIS_PARAM_NUM); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + } + +final: + for (i = 0; i < 2; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + + return node; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_nn_kernel_node_t node = NULL; + vsi_size_t *input_size = inputs[0]->attr.size; + int32_t axis = 0; + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if (input_size[axis] >= GPU_TENSOR_MAX_WIDTH) + { + node = _setup_exceed(graph, inputs, input_num, outputs, output_num, params, kernel); + } + else + { + node = _setup_not_exceed(graph, inputs, input_num, outputs, output_num, params, kernel); + } + + return node; +} + + __END_DECLS REGISTER_BACKEND_EVIS( log_softmax, _setup ) - +#endif diff --git a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c index 46ab93f..2ec1b1a 100644 --- a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c @@ -996,18 +996,14 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer) float forget_bias = 0.0f; float outputScale = 1.0f; float outputZP = 0; - int32_t dstZP = 0; - float dstScale = 1.0f; vsi_nn_kernel_dtype_e cellFormat = F16; vsi_nn_kernel_dtype_e dstFormat = F16; - vsi_nn_kernel_quant_type_e dstQuantType = VSI_NN_KERNEL_QUANT_NONE; - int32_t dstFixPointPos = 0; - float logE = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); + float logE = (float)(log10(exp(1.0f)) / log10(2.0f)); float twoLogE = 2 * logE; uint32_t uint_min = 0xFBFFFFFF; uint32_t uint_max = 0x7BFFFFFF; - float float_min = *(vx_float32 *)&uint_min; - float float_max = *(vx_float32 *)&uint_max; + float float_min = *(float *)&uint_min; + float float_max = *(float *)&uint_max; float clip_Min_F[4] = {0}; float clip_Max_F[4] = {0}; uint32_t i = 0; @@ -1063,22 +1059,11 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer) status = vsi_nn_kernel_scalar_read_float32( (vsi_nn_kernel_scalar_t)param[param_size - 1], &forget_bias ); CHECK_STATUS_FAIL_GOTO(status, final ); - cellFormat = attr[0]->dtype; - dstFormat = attr[1]->dtype; + cellFormat = attr[0]->dtype; + dstFormat = attr[1]->dtype; - dstQuantType = attr[1]->quant; - - if ( VSI_NN_KERNEL_QUANT_DFP == dstQuantType ) - { - dstFixPointPos = (int8_t)attr[1]->dfp.fl; - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == dstQuantType ) - { - dstZP = attr[1]->asymm.zero_point; - dstScale = attr[1]->asymm.scale; - } - - outputZP = (vx_float32)dstZP; + outputScale = 1.0f / attr[1]->scale; + outputZP = (float)attr[1]->zero_point; gpu_param.global_scale[0] = 4; gpu_param.global_scale[1] = 1; @@ -1182,20 +1167,6 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer) 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant }, GPU_DP_TYPE_16}; - if (dstQuantType == VSI_NN_KERNEL_QUANT_DFP) - { - if (dstFixPointPos >= 0) - outputScale *= (vx_float32)((int64_t)1 << dstFixPointPos); - else if (dstFixPointPos < 0) - outputScale *= 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos); - - outputZP = 0; - } - else if (dstQuantType == VSI_NN_KERNEL_QUANT_ASYMM) - { - outputScale = 1.0f / dstScale; - } - if ( cellFormat == F16 ) { status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_4x4", 
&uniExtractHalf4_4x4); diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c index 1b15caa..b643d9b 100644 --- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -288,67 +288,13 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &K); CHECK_STATUS_FAIL_GOTO(status, OnError ); - src0ZP = attr[0]->asymm.zero_point; - src0Scale = attr[0]->asymm.scale; - src1ZP = attr[1]->asymm.zero_point; - src1Scale = attr[1]->asymm.scale; - dstZP = (float)attr[2]->asymm.zero_point; - dstScale = attr[2]->asymm.scale; + src0ZP = attr[0]->zero_point; + src0Scale = attr[0]->scale; + src1ZP = attr[1]->zero_point; + src1Scale = attr[1]->scale; + dstZP = (float)attr[2]->zero_point; + dstScale = attr[2]->scale; - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - src0ZP = 0; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - src0Scale = 1; - src0ZP = 0; - } - - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[1]->dfp.fl > 0) - { - src1Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl))); - } - else - { - src1Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl)); - } - src1ZP = 0; - } - else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - src1Scale = 1; - src1ZP = 0; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - dstScale = 1.0f / dstScale; - dstZP = 0.0f; - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - dstScale = 1; - dstZP = 0.0f; - } gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postShift0); gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postShift1); @@ -1266,67 +1212,12 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_cross_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &axis_size); CHECK_STATUS_FAIL_GOTO(status, OnError ); - src0ZP = attr[0]->asymm.zero_point; - src0Scale = attr[0]->asymm.scale; - src1ZP = attr[1]->asymm.zero_point; - src1Scale = attr[1]->asymm.scale; - dstZP = (float)attr[2]->asymm.zero_point; - dstScale = attr[2]->asymm.scale; - - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - src0ZP = 0; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - src0Scale = 1; - src0ZP = 0; - } - - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[1]->dfp.fl > 0) - { - src1Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl))); - } - else - { - src1Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl)); - } - src1ZP = 0; - } - else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - src1Scale = 1; - src1ZP = 0; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - dstScale = 1.0f / dstScale; - dstZP = 0.0f; - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - 
dstScale = 1; - dstZP = 0.0f; - } + src0ZP = attr[0]->zero_point; + src0Scale = attr[0]->scale; + src1ZP = attr[1]->zero_point; + src1Scale = attr[1]->scale; + dstZP = (float)attr[2]->zero_point; + dstScale = attr[2]->scale; mulKIn0In1Zp = (float)((int)(K + 3) / 4 * 4 * src1ZP * src0ZP); inOutScale = src0Scale * src1Scale / dstScale; diff --git a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c index d862eb7..4e319da 100644 --- a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c @@ -163,63 +163,12 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); out_shape = attr[2]->shape; - - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[0]->dfp.fl; - if (fl > 0) - { - input0_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input0_scale = (float)((int64_t)1 << -fl); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - input0_zp = attr[0]->asymm.zero_point; - input0_scale = attr[0]->asymm.scale; - } - - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[1]->dfp.fl; - if (fl > 0) - { - input1_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input1_scale = (float)((int64_t)1 << -fl); - } - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - input1_zp = attr[1]->asymm.zero_point; - input1_scale = attr[1]->asymm.scale; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[2]->dfp.fl; - if (fl > 0) - { - output_scale = (float) ((int64_t)1 << fl); - } - else - { - output_scale = 1.0f / (float)((int64_t)1 << -fl); - } - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - output_zp = attr[2]->asymm.zero_point; - output_scale = 1.0f / attr[2]->asymm.scale; - } + input0_zp = attr[0]->zero_point; + input0_scale = attr[0]->scale; + input1_zp = attr[1]->zero_point; + input1_scale = attr[1]->scale; + output_zp = attr[2]->zero_point; + output_scale = 1.0f / attr[2]->scale; #define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ (IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16)) @@ -454,30 +403,52 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* tmp_inputs[2] = { NULL }; vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type; vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t new_rank = 0; + vsi_bool ret = TRUE; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); VSI_UNREFERENCED(params); - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + ret = vsi_nn_kernel_optimize_eltwise_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if (ret == FALSE) { - return NULL; + goto final; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], new_rank ); + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size, + 
reshape_tensors[2]->attr.dim_num ) ) + { + goto final; } // Reorder tensor if ( dtype1 != dtype2 && dtype1 == VSI_NN_TYPE_FLOAT16 ) { int32_t order[2] = {1, 0}; - vsi_nn_reorder_tensor( inputs, order, 2, tmp_inputs ); + vsi_nn_reorder_tensor( reshape_tensors, order, 2, tmp_inputs ); } else { - memmove( tmp_inputs, inputs, sizeof(vsi_nn_tensor_t*) * 2 ); + memmove( tmp_inputs, reshape_tensors, sizeof(vsi_nn_tensor_t*) * 2 ); } - image_2d = (outputs[0]->attr.dim_num == 2); - status = _query_kernel( tmp_inputs, outputs, image_2d, kernel ); + image_2d = (reshape_tensors[2]->attr.dim_num == 2); + status = _query_kernel( tmp_inputs, &reshape_tensors[2], image_2d, kernel ); if ( VSI_SUCCESS == status ) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -485,10 +456,16 @@ static vsi_nn_kernel_node_t _setup { /* Pass parameters to node. */ vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM, - tmp_inputs, 2, outputs, 1 ); + tmp_inputs, 2, &reshape_tensors[2], 1 ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM ); } } + +final: + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + vsi_safe_release_tensor(reshape_tensors[2]); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c index cb9fc35..a86d57a 100644 --- a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c @@ -163,63 +163,12 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); out_shape = attr[2]->shape; - - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[0]->dfp.fl; - if (fl > 0) - { - input0_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input0_scale = (float)((int64_t)1 << -fl); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - input0_zp = attr[0]->asymm.zero_point; - input0_scale = attr[0]->asymm.scale; - } - - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[1]->dfp.fl; - if (fl > 0) - { - input1_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input1_scale = (float)((int64_t)1 << -fl); - } - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - input1_zp = attr[1]->asymm.zero_point; - input1_scale = attr[1]->asymm.scale; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[2]->dfp.fl; - if (fl > 0) - { - output_scale = (float) ((int64_t)1 << fl); - } - else - { - output_scale = 1.0f / (float)((int64_t)1 << -fl); - } - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - output_zp = attr[2]->asymm.zero_point; - output_scale = 1.0f / attr[2]->asymm.scale; - } + input0_zp = attr[0]->zero_point; + input0_scale = attr[0]->scale; + input1_zp = attr[1]->zero_point; + input1_scale = attr[1]->scale; + output_zp = attr[2]->zero_point; + output_scale = 1.0f / attr[2]->scale; #define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ (IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16)) @@ -454,30 +403,52 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* tmp_inputs[2] = { NULL }; vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type; vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t 
shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t new_rank = 0; + vsi_bool ret = TRUE; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); VSI_UNREFERENCED(params); - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + ret = vsi_nn_kernel_optimize_eltwise_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if (ret == FALSE) { - return NULL; + goto final; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], new_rank ); + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size, + reshape_tensors[2]->attr.dim_num ) ) + { + goto final; } // Reorder tensor if ( dtype1 != dtype2 && dtype1 == VSI_NN_TYPE_FLOAT16 ) { int32_t order[2] = {1, 0}; - vsi_nn_reorder_tensor( inputs, order, 2, tmp_inputs ); + vsi_nn_reorder_tensor( reshape_tensors, order, 2, tmp_inputs ); } else { - memmove( tmp_inputs, inputs, sizeof(vsi_nn_tensor_t*) * 2 ); + memmove( tmp_inputs, reshape_tensors, sizeof(vsi_nn_tensor_t*) * 2 ); } - image_2d = (outputs[0]->attr.dim_num == 2); - status = _query_kernel( tmp_inputs, outputs, image_2d, kernel ); + image_2d = (reshape_tensors[2]->attr.dim_num == 2); + status = _query_kernel( tmp_inputs, &reshape_tensors[2], image_2d, kernel ); if ( VSI_SUCCESS == status ) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -485,10 +456,16 @@ static vsi_nn_kernel_node_t _setup { /* Pass parameters to node. */ vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM, - tmp_inputs, 2, outputs, 1 ); + tmp_inputs, 2, &reshape_tensors[2], 1 ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM ); } } + +final: + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + vsi_safe_release_tensor(reshape_tensors[2]); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/mod_evis.c b/src/tim/vx/internal/src/kernel/evis/mod_evis.c index 70188f6..a2e28bf 100644 --- a/src/tim/vx/internal/src/kernel/evis/mod_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/mod_evis.c @@ -128,9 +128,6 @@ DEF_KERNEL_INITIALIZER(_mod_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; vsi_nn_kernel_dtype_e input0_dtype = F16; - int32_t input0_fl = 0; - int32_t input1_fl = 0; - int32_t output_fl = 0; float inScale0 = 1.0f; float inScale1 = 1.0f; float outScale = 1.0f; @@ -168,59 +165,12 @@ DEF_KERNEL_INITIALIZER(_mod_initializer) (output_shape->data[2] + gpu_param.global_scale[2] - 1) / gpu_param.global_scale[2] : 1; - if (input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP) - { - input0_fl = input0_attr->dfp.fl; - if (input0_fl > 0) - { - inScale0 = 1.0f / (float) ((int64_t)1 << input0_fl); - } - else - { - inScale0 = (float)((int64_t)1 << -input0_fl); - } - } - else if (input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - inScale0 = input0_attr->asymm.scale; - in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point); - } - - if (input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP) - { - input1_fl = input1_attr->dfp.fl; - if (input1_fl > 0) - { - inScale1 = 1.0f / (float) ((int64_t)1 << input1_fl); - } - else - { - inScale1 = (float)((int64_t)1 << -input1_fl); - } - } - 
else if (input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - inScale1 = input1_attr->asymm.scale; - in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point); - } - - if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP) - { - output_fl = output_attr->dfp.fl; - if (output_fl > 0) - { - outScale = (float) ((int64_t)1 << output_fl); - } - else - { - outScale = 1.0f / (float)((int64_t)1 << -output_fl); - } - } - else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - outScale = 1.0f / output_attr->asymm.scale; - outZp = (float)(output_attr->asymm.zero_point); - } + inScale0 = input0_attr->scale; + in0Tail = 0 - inScale0 * ((float)input0_attr->zero_point); + inScale1 = input1_attr->scale; + in1Tail = 0 - inScale1 * ((float)input1_attr->zero_point); + outScale = 1.0f / output_attr->scale; + outZp = (float)(output_attr->zero_point); if (BF16 == input0_dtype) { diff --git a/src/tim/vx/internal/src/kernel/evis/moments_evis.c b/src/tim/vx/internal/src/kernel/evis/moments_evis.c index 9dc6eae..18bb050 100644 --- a/src/tim/vx/internal/src/kernel/evis/moments_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/moments_evis.c @@ -239,76 +239,12 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); input_shape = attr[0]->shape; - - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - - input_zp = 0; - } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - input_zp = 0; - scaleIn = 1; - } - - if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - output_ZP0 = (float)attr[1]->asymm.zero_point; - outputScale0 = 1.0f / attr[1]->asymm.scale; - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[1]->dfp.fl > 0) - { - outputScale0 = (float)((int64_t)1 << attr[1]->dfp.fl); - } - else - { - outputScale0 = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); - } - output_ZP0 = 0.0f; - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - outputScale0 = 1.0f; - output_ZP0 = 0.0f; - } - - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - output_ZP1 = (float)attr[2]->asymm.zero_point; - outputScale1 = 1.0f / attr[2]->asymm.scale; - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - outputScale1 = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - outputScale1 = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - output_ZP1 = 0.0f; - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - outputScale1 = 1.0f; - output_ZP1 = 0.0f; - } + input_zp = attr[0]->zero_point; + scaleIn = attr[0]->scale; + output_ZP0 = (float)attr[1]->zero_point; + outputScale0 = 1.0f / attr[1]->scale; + output_ZP1 = (float)attr[2]->zero_point; + outputScale1 = 1.0f / attr[2]->scale; output_ZP[0] = output_ZP0; output_ZP[1] = output_ZP1; diff --git a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c index de2d35a..46eaa81 100644 --- a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c @@ -160,16 +160,13 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer) in_shape = attr[0]->shape; depth = (int32_t)(attr[1]->shape->data[1]); input_dtype = attr[0]->dtype; + input_zp = attr[0]->zero_point; + 
scaleIn = attr[0]->scale; if (VSI_NN_KERNEL_QUANT_DFP == attr[0]->quant) { srcFixPointPos = attr[0]->dfp.fl; } - else if (VSI_NN_KERNEL_QUANT_ASYMM == attr[0]->quant) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } if (suffix_size == 1) { diff --git a/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c b/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c index e45704f..55b7e59 100644 --- a/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c @@ -155,41 +155,19 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer) input_shape = input_attr->shape; src_dtype = input_attr->dtype; dst_dtype = output_attr->dtype; + inputScale = input_attr->scale; + input_ZP = input_attr->zero_point; + outputScale = output_attr->scale; + output_ZP = output_attr->zero_point; if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { input_fl = input_attr->dfp.fl; - if (input_fl > 0) - { - inputScale = 1.0f / (float) ((int64_t)1 << input_fl); - } - else - { - inputScale = (float)((int64_t)1 << -input_fl); - } - } - else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputScale = input_attr->asymm.scale; - input_ZP = input_attr->asymm.zero_point; - } if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { output_fl = output_attr->dfp.fl; - if (output_fl > 0) - { - outputScale = 1.0f / (float) ((int64_t)1 << output_fl); - } - else - { - outputScale = (float)((int64_t)1 << -output_fl); - } - } - else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outputScale = output_attr->asymm.scale; - output_ZP = output_attr->asymm.zero_point; - } if ( ( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) diff --git a/src/tim/vx/internal/src/kernel/evis/pow_evis.c b/src/tim/vx/internal/src/kernel/evis/pow_evis.c index 679526e..8492528 100644 --- a/src/tim/vx/internal/src/kernel/evis/pow_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pow_evis.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_TENSOR_POW_API_SUPPORT) #include <stdint.h> #include <stdio.h> #include <string.h> @@ -158,64 +159,13 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); - out_shape = attr[2]->shape; - - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[0]->dfp.fl; - if (fl > 0) - { - input0_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input0_scale = (float)((int64_t)1 << -fl); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - input0_scale = attr[0]->asymm.scale; - input0_tail = 0 - (float)attr[0]->asymm.zero_point * input0_scale; - } - - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[1]->dfp.fl; - if (fl > 0) - { - input1_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input1_scale = (float)((int64_t)1 << -fl); - } - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - input1_scale = attr[1]->asymm.scale; - input1_tail = 0 - (float)attr[1]->asymm.zero_point * input1_scale; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[2]->dfp.fl; - if (fl > 0) - { - output_scale = (float) ((int64_t)1 << fl); - } - else - { - output_scale = 1.0f / (float)((int64_t)1 << -fl); - } - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM -
|| attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - output_zp = (float)attr[2]->asymm.zero_point; - output_scale = 1.0f / attr[2]->asymm.scale; - } + out_shape = attr[2]->shape; + input0_scale = attr[0]->scale; + input0_tail = 0 - (float)attr[0]->zero_point * input0_scale; + input1_scale = attr[1]->scale; + input1_tail = 0 - (float)attr[1]->zero_point * input1_scale; + output_zp = (float)attr[2]->zero_point; + output_scale = 1.0f / attr[2]->scale; #define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ (IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16)) @@ -454,3 +404,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( pow, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c index 52588a4..89b4785 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c @@ -140,28 +140,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) } enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15)); - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - outputScale = (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } - dstZP = 0; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - outputScale = 1.0f / attr[0]->asymm.scale; - dstZP = attr[0]->asymm.zero_point; - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - outputScale = 1; - dstZP = 0; - } + outputScale = 1.0f / attr[0]->scale; + dstZP = attr[0]->zero_point; shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c index 1973eb2..1ea8250 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c @@ -133,28 +133,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer) width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - outputScale = (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } - dstZP = 0.0f; - } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - outputScale = 1.0f / attr[0]->asymm.scale; - dstZP = (float)attr[0]->asymm.zero_point; - } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - outputScale = 1; - dstZP = 0.0f; - } + outputScale = 1.0f / attr[0]->scale; + dstZP = (float)attr[0]->zero_point; shaderParam.global_scale[0] = 16; shaderParam.global_scale[1] = 1; @@ -232,33 +212,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_initializer) CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); out_shape = attr[0]->shape; - dstZP = (float)attr[0]->asymm.zero_point; - outputScale = attr[0]->asymm.scale; + dstZP = (float)attr[0]->zero_point; + outputScale = 1.0f / attr[0]->scale; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - outputScale = (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } - dstZP = 0.0f; - } - else if(attr[0]->quant == 
VSI_NN_KERNEL_QUANT_ASYMM) - { - outputScale = 1.0f/outputScale; - } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - outputScale = 1; - dstZP = 0.0f; - } - shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; @@ -499,8 +457,8 @@ OnError: } if (attr[1]) { - vsi_nn_kernel_tensor_attr_release( &attr[0] ); - attr[0] = NULL; + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; } return status; diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_rggb_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_rggb_evis.c new file mode 100644 index 0000000..a58c823 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_rggb_evis.c @@ -0,0 +1,884 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOF16 \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toF16") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI16 \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOU8 \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI8 \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOU8 \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOI8 \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOI16 \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOF16 \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toF16") + +// greater than a quarter +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOU8_GQ \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toU8_gq") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI8_GQ \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toU8_gq") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOF16_GQ \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toF16_gq") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI16_GQ \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toI16_gq") + +#define KERNEL_SOURCE_1 "pre_process_nv12_rggb_copy", +#define KERNEL_SOURCE_2 "pre_process_nv12_rggb_scale", + +typedef enum +{ + COPY = 0, + SCALE, + TRANS +} vsi_nn_kernel_convert_type_e; + +#define HASH_PRE_PROCESS_NV12_RGGB_KEY(_input0_type, _output_type, _convert_type, _greater_quarter) \ + ((_input0_type << 24) | (_output_type << 16) | (_convert_type << 8) | (_greater_quarter)) + +#define TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \ + { HASH_PRE_PROCESS_NV12_RGGB_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 0), \ + VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +#define TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \ + { HASH_PRE_PROCESS_NV12_RGGB_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 1), \ + VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE##_GQ, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } pre_process_nv12_rggb_map[] = +{ + TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I8, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I16, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, F16, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2) +
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2) +}; + +static vx_param_description_t vxPreProcessNv12_RGGBKernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM _cnt_of_array(vxPreProcessNv12_RGGBKernel_param_def) + +static vsi_bool _check_nv12_type_from_env() +{ + vsi_bool ret = FALSE; + char* env_s = vsi_nn_getenv("VSI_NN_ENABLE_OCV_NV12"); + if (env_s) + { + ret = TRUE; + } + return ret; +} + +DEF_KERNEL_INITIALIZER(_pre_process_nv12_rggb_copy_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + float output_zp = 0; + float output_scale = 1; + int32_t reorder = 0; + int32_t order1 = 3; + uint32_t width = 0; + uint32_t height = 0; + int32_t nv_type = 0; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f; + float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f; + float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f; + float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_size_array_t * out_shape = NULL; + vsi_bool ocv_nv12 = _check_nv12_type_from_env(); + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &rMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &gMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &r_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = 
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &nv_type); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &g_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[15], &b_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + output_scale = 1.0f / attr[0]->scale; + output_zp = (float)attr[0]->zero_point; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); + + if (reorder != 0) + { + reorder = 3; + order1 = 0; + } + + if (nv_type == VSI_NN_YUV_TYPE_NV21_BGGR) + { + int32_t tmporder = reorder; + reorder = order1; + order1 = tmporder; + } + + outputScaleVar_b = output_scale * b_scale; + outputScaleVar_g = output_scale * g_scale; + outputScaleVar_r = output_scale * r_scale; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r; + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 2); + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertNV12toB_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00210000, 0x00630042, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000, + 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertNV12toG_4x4 = {{ + 0x29292929, // TCfg + 0x14141414, // ASelt + 0x03210100, 0x07630542, // ABin + 0x2a2a2a2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc, + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertNV12toR_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00310010, 0x00730052, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractYtoShortSub16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, 
GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x01000100, 0x03020302, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, + 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, + 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if (ocv_nv12) + { + uniConvertNV12toB_4x4.data[2] = 0x00010000; + uniConvertNV12toB_4x4.data[3] = 0x00230022; + uniConvertNV12toB_4x4.data[8] = 0x40093ca7; + uniConvertNV12toB_4x4.data[10] = 0x40093ca7; + uniConvertNV12toB_4x4.data[12] = 0x40093ca7; + uniConvertNV12toB_4x4.data[14] = 0x40093ca7; + + uniConvertNV12toG_4x4.data[2] = 0x01010100; + uniConvertNV12toG_4x4.data[3] = 0x03230322; + uniConvertNV12toG_4x4.data[8] = 0x36413ca7; + uniConvertNV12toG_4x4.data[9] = 0x00003a81; + uniConvertNV12toG_4x4.data[10] = 0x36413ca7; + uniConvertNV12toG_4x4.data[11] = 0x00003a81; + uniConvertNV12toG_4x4.data[12] = 0x36413ca7; + uniConvertNV12toG_4x4.data[13] = 0x00003a81; + uniConvertNV12toG_4x4.data[14] = 0x36413ca7; + uniConvertNV12toG_4x4.data[15] = 0x00003a81; + + uniConvertNV12toR_4x4.data[2] = 0x00110010; + uniConvertNV12toR_4x4.data[3] = 0x00330032; + uniConvertNV12toR_4x4.data[8] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[10] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[12] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[14] = 0x3e623ca7; + + uniExtractUVtoCharSub128_2x8.data[2] = 0x03020100; + uniExtractUVtoCharSub128_2x8.data[3] = 0x07060504; + + uniExtractYtoShortSub16_2x8.data[0] = 0x99999999; + uniExtractYtoShortSub16_2x8.data[1] = 0x44444444; + uniExtractYtoShortSub16_2x8.data[4] = 0xaaaaaaaa; + uniExtractYtoShortSub16_2x8.data[8] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[9] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[10] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[11] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[12] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[13] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[14] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[15] = 0x00010001; + } + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); + status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYtoShortSub16_2x8", &uniExtractYtoShortSub16_2x8); + status |= 
vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r); + status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4); + CHECK_STATUS_FAIL_GOTO(status, OnError); + switch( attr[0]->dtype ) + { + case U8: + case I8: + case I16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case F16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_nv12_rggb_copy_initializer() */ + +DEF_KERNEL_INITIALIZER(_pre_process_nv12_rggb_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + float output_zp = 0; + float output_scale = 1; + int32_t reorder = 0; + int32_t order1 = 3; + uint32_t width = 0; + uint32_t height = 0; + uint32_t roi_width = 0; + uint32_t roi_height = 0; + uint32_t xrIntFloat_16 = 0; + uint32_t yrIntFloat_16 = 0; + int32_t xRatio = 0; + int32_t yRatio = 0; + int32_t nv_type = 0; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f; + float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f; + float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f; + float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; + float resize = 0.0f; + vsi_bool ocv_nv12 = _check_nv12_type_from_env(); + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_size_array_t * out_shape = NULL; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &xRatio); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &yRatio); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &rMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &gMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean); + 
CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &r_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &nv_type); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &g_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[15], &b_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[1]->shape; + output_scale = 1.0f / attr[1]->scale; + output_zp = (float)attr[1]->zero_point; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); + + if (reorder != 0) + { + reorder = 3; + order1 = 0; + } + + if (nv_type == VSI_NN_YUV_TYPE_NV21_BGGR) + { + int32_t tmporder = reorder; + reorder = order1; + order1 = tmporder; + } + + roi_width = (xRatio * width) >> 15; + roi_height = (yRatio * height) >> 15; + resize = (float)width / roi_width; + xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1); + yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1); + + outputScaleVar_b = output_scale * b_scale; + outputScaleVar_g = output_scale * g_scale; + outputScaleVar_r = output_scale * r_scale; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r; + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 2); + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertNV12toB_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00210000, 0x00630042, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000, + 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertNV12toG_4x4 = {{ + 0x29292929, // TCfg + 0x14141414, // ASelt + 0x03210100, 0x07630542, // ABin + 0x2a2a2a2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc, + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertNV12toR_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00310010, 0x00730052, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, + 
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertYtoShortSub16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertUVtoCharSub128_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + + //trans + gpu_dp_inst_t uniCalculateYShift_2x8 = {{ + 0x00009999, // TCfg + 0x00000000, // ASelt + 0x06040200, 0x00000000, // ABin + 0x00005555, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateUVShift_2x8 = {{ + 0x51515151, // TCfg + 0x40404040, // ASelt + 0x02020000, 0x06060404, // ABin + 0x91919191, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00010000, 0x00000000, 0x00010000, + 0x00000000, 0x00010000, 0x00000000, 0x00010000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, + 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, + 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if (ocv_nv12) + { + uniConvertNV12toB_4x4.data[2] = 0x00010000; + uniConvertNV12toB_4x4.data[3] = 0x00230022; + uniConvertNV12toB_4x4.data[8] = 0x40093ca7; + uniConvertNV12toB_4x4.data[10] = 0x40093ca7; + uniConvertNV12toB_4x4.data[12] = 0x40093ca7; + uniConvertNV12toB_4x4.data[14] = 0x40093ca7; + + uniConvertNV12toG_4x4.data[2] = 0x01010100; + uniConvertNV12toG_4x4.data[3] = 0x03230322; + uniConvertNV12toG_4x4.data[8] = 0x36413ca7; + uniConvertNV12toG_4x4.data[9] = 0x00003a81; + uniConvertNV12toG_4x4.data[10] = 0x36413ca7; + uniConvertNV12toG_4x4.data[11] = 0x00003a81; + uniConvertNV12toG_4x4.data[12] = 0x36413ca7; + uniConvertNV12toG_4x4.data[13] = 0x00003a81; + uniConvertNV12toG_4x4.data[14] = 0x36413ca7; + uniConvertNV12toG_4x4.data[15] = 0x00003a81; + + uniConvertNV12toR_4x4.data[2] = 0x00110010; + uniConvertNV12toR_4x4.data[3] = 0x00330032; + uniConvertNV12toR_4x4.data[8] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[10] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[12] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[14] = 0x3e623ca7; + + uniConvertYtoShortSub16_2x8.data[0] = 0x99999999; + uniConvertYtoShortSub16_2x8.data[1] = 0x44444444; + uniConvertYtoShortSub16_2x8.data[4] = 0xaaaaaaaa; + 
uniConvertYtoShortSub16_2x8.data[8] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[9] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[10] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[11] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[12] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[13] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[14] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[15] = 0x00010001; + } + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUVtoCharSub128_2x8", &uniConvertUVtoCharSub128_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYtoShortSub16_2x8", &uniConvertYtoShortSub16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16); + status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r); + status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); + + if (resize >= 0.25) + { + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); + status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( attr[1]->dtype ) + { + case U8: + case I8: + case I16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case F16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + return status; +} /* _pre_process_nv12_rggb_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params, + int32_t scale_x + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_nn_kernel_convert_type_e convert_type = SCALE; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + size_t i = 0; + vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + vsi_size_t dstWidth = outputs[0]->attr.size[0]; + float scaleVal = (float)dstWidth / ((scale_x * dstWidth) >> 15); + uint32_t optFlg = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type 
); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (enable_copy) + { + convert_type = COPY; + } + else + { + convert_type = SCALE; + } + + if (scaleVal >= 0.25 && convert_type == SCALE) + { + optFlg = 1; + } + + key = HASH_PRE_PROCESS_NV12_RGGB_KEY( input0_dtype, output_dtype, convert_type, optFlg ); + + for ( i = 0; i < _cnt_of_array(pre_process_nv12_rggb_map); i ++ ) + { + if ( pre_process_nv12_rggb_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(pre_process_nv12_rggb_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_nv12_rggb_map[i].function_name ); + kernel->info.parameters = vxPreProcessNv12_RGGBKernel_param_def; + kernel->info.numParams = _cnt_of_array( vxPreProcessNv12_RGGBKernel_param_def ); + + if (convert_type == COPY) + { + kernel->info.initialize = _pre_process_nv12_rggb_copy_initializer; + } + else + { + kernel->info.initialize = _pre_process_nv12_rggb_initializer; + } + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + pre_process_nv12_rggb_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + pre_process_nv12_rggb_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t trans = 0; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, params, scale_x ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 3; + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + int32_t nv_type = vsi_nn_kernel_param_get_int32( params, "nv_type" ); + + /* Pass parameters to node. 
*/ + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM, + inputs, 2, outputs, 1 ); + + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &nv_type ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + vsi_nn_kernel_scalar_release( &tmp_params[5] ); + vsi_nn_kernel_scalar_release( &tmp_params[6] ); + vsi_nn_kernel_scalar_release( &tmp_params[7] ); + vsi_nn_kernel_scalar_release( &tmp_params[8] ); + vsi_nn_kernel_scalar_release( &tmp_params[9] ); + vsi_nn_kernel_scalar_release( &tmp_params[10] ); + vsi_nn_kernel_scalar_release( &tmp_params[11] ); + vsi_nn_kernel_scalar_release( &tmp_params[12] ); + vsi_nn_kernel_scalar_release( &tmp_params[13] ); + vsi_nn_kernel_scalar_release( &tmp_params[14] ); + vsi_nn_kernel_scalar_release( &tmp_params[15] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( pre_process_nv12_rggb, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c index 256f7e5..d9f96b2 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c @@ -403,23 +403,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer) out_shape = attr[0]->shape; width = (uint32_t)(out_shape->data[0]); - - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if ( attr[0]->dfp.fl > 0 ) - { - output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - output_zp = (float)attr[0]->asymm.zero_point; - output_scale /= attr[0]->asymm.scale; - } + output_zp = (float)attr[0]->zero_point; + output_scale = 1.0f / attr[0]->scale; shaderParam.global_scale[0] = 16; shaderParam.global_scale[1] = 1; @@ -620,8 +605,8 @@ OnError: } if (attr[1]) { - vsi_nn_kernel_tensor_attr_release( &attr[0] ); - attr[0] = NULL; + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; } return status; diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c index ae559da..0504dff 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c +++ 
b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c @@ -463,22 +463,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer) width = (uint32_t)(out_shape->data[0] / 3); height = (uint32_t)(out_shape->data[1]); - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if ( attr[0]->dfp.fl > 0 ) - { - output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - output_zp = (float)attr[0]->asymm.zero_point; - output_scale /= attr[0]->asymm.scale; - } + output_zp = (float)attr[0]->zero_point; + output_scale = 1.0f / attr[0]->scale; if (attr[0]->dtype == F16 || attr[0]->dtype == I16) { @@ -787,8 +773,8 @@ OnError: } if (attr[1]) { - vsi_nn_kernel_tensor_attr_release( &attr[0] ); - attr[0] = NULL; + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; } return status; diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c index 984293b..dd27137 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c @@ -179,28 +179,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) } enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15)); - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - outputScale = (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } - outputZP = 0; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - outputScale = 1.0f / attr[0]->asymm.scale; - outputZP = (float)attr[0]->asymm.zero_point; - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - outputScale = 1; - outputZP = 0; - } + outputScale = 1.0f / attr[0]->scale; + outputZP = (float)attr[0]->zero_point; #define _PACK_SELECT_KEY( COPY_FLAG, REVERSE_FLAG, TRANS_FLAG) \ (COPY_FLAG | (REVERSE_FLAG << 24) | (TRANS_FLAG << 16) ) diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c index 4c322a8..1956b29 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c @@ -143,23 +143,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) order1 = 0; } - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - dstScale = 1.0f / attr[0]->asymm.scale; - dstZP = attr[0]->asymm.zero_point; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } - dstZP = 0; - } + dstScale = 1.0f / attr[0]->scale; + dstZP = attr[0]->zero_point; shaderParam.global_scale[0] = 16; shaderParam.global_scale[1] = 1; @@ -501,8 +486,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; - dstZP = attr[0]->asymm.zero_point; - dstScale = attr[0]->asymm.scale; + dstZP = attr[0]->zero_point; + dstScale = 1.0f / attr[0]->scale; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -512,28 +497,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) order1 = 0; } - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if 
(attr[0]->dfp.fl > 0) - { - dstScale = (vx_float32)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[0]->dfp.fl)); - } - dstZP = 0; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - dstScale = 1.0f / dstScale; - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - dstScale = 1; - dstZP = 0; - } - shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; diff --git a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c index bed0b6c..4703424 100644 --- a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c @@ -164,46 +164,24 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - out_shape = attr[2]->shape; + out_shape = attr[2]->shape; + inputZP0 = attr[0]->zero_point; + input_scale0 = attr[0]->scale; + inputZP1 = attr[1]->zero_point; + input_scale1 = attr[1]->scale; + outputZP = (float)attr[2]->zero_point; + input_scale0 = input_scale0 / attr[2]->scale; + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { in0_fl = (int8_t)attr[0]->dfp.fl; - if (in0_fl >= 0) - { - input_scale0 = 1.0f / (vx_float32) ((int64_t)1 << in0_fl); - } - else if (in0_fl < 0) - { - input_scale0 = (vx_float32) ((int64_t)1 << -in0_fl); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputZP0 = attr[0]->asymm.zero_point; - input_scale0 = attr[0]->asymm.scale; - } - - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputZP1 = attr[1]->asymm.zero_point; - input_scale1 = attr[1]->asymm.scale; } if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) { out_fl = (int8_t)attr[2]->dfp.fl; + } - if (out_fl >= 0) - input_scale0 *= (vx_float32)((int64_t)1 << out_fl); - else if (out_fl < 0) - input_scale0 *= 1.0f / (vx_float32) ((int64_t)1 << -out_fl); - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - out_fl = 1; - outputZP = (float)attr[2]->asymm.zero_point; - input_scale0 = input_scale0 / attr[2]->asymm.scale; - } shift0 = in0_fl - out_fl; is_2d_img = (out_shape->size < 3) || (out_shape->data[2] == 1); diff --git a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c index efb52f0..8e71126 100644 --- a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c @@ -152,7 +152,6 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t * input_shape = NULL; vsi_size_array_t * output_shape = NULL; - int32_t input_fl = 0, output_fl = 0; int32_t axisSize = 0; float inputScale = 1.0f; float input_offset_asymmetric = 0.0f; @@ -257,68 +256,19 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) } } - if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - input_fl = input_attr->dfp.fl; - if (input_fl > 0) - { - inputScale = 1.0f / (float) ((int64_t)1 << input_fl); - } - else - { - inputScale = (float)((int64_t)1 << -input_fl); - } - status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputScale = input_attr->asymm.scale; - input_offset_asymmetric = (float)(input_attr->asymm.zero_point); - 
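/*
 * The branches being removed here (and in the other initializers touched by
 * this patch) all derive one affine (scale, zero_point) pair from the
 * tensor's quantization mode: DFP yields scale = 2^-fl with a zero point of
 * 0, ASYMM yields the stored scale/zero_point, and unquantized tensors fall
 * back to (1.0f, 0).  The replacement code simply reads that pair from
 * attr->scale / attr->zero_point.  A minimal sketch of the mapping the old
 * branches computed, using a free-standing helper that is illustrative only
 * and not part of ovxlib:
 */
#include <stdint.h>

typedef enum { QUANT_NONE_SKETCH, QUANT_DFP_SKETCH, QUANT_ASYMM_SKETCH } quant_mode_sketch_e;

static void quant_to_affine_sketch(quant_mode_sketch_e mode, int32_t fl,
                                   float asymm_scale, int32_t asymm_zero_point,
                                   float *scale, float *zero_point)
{
    if (mode == QUANT_DFP_SKETCH)
    {
        /* A positive fixed-point position fl means one LSB is worth 2^-fl. */
        *scale = (fl >= 0) ? 1.0f / (float)((int64_t)1 << fl)
                           : (float)((int64_t)1 << -fl);
        *zero_point = 0.0f;
    }
    else if (mode == QUANT_ASYMM_SKETCH)
    {
        *scale = asymm_scale;
        *zero_point = (float)asymm_zero_point;
    }
    else
    {
        *scale = 1.0f;
        *zero_point = 0.0f;
    }
}
/*
 * With the pair precomputed this way, the single remaining path reproduces
 * the removed logic: inputScale = input_attr->scale and
 * outputScale = 1.0f / output_attr->scale.
 */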
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else - { - inputScale = 1.0f; - input_offset_asymmetric = 0; + inputScale = input_attr->scale; + input_offset_asymmetric = (float)(input_attr->zero_point); + outputScale = 1.0f / output_attr->scale; + output_offset_asymmetric = (float)(output_attr->zero_point); - status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); - if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - output_fl = output_attr->dfp.fl; - if (output_fl > 0) - { - outputScale = (float) ((int64_t)1 << output_fl); - } - else - { - outputScale = 1.0f / (float)((int64_t)1 << -output_fl); - } - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outputScale = 1.0f / output_attr->asymm.scale; - output_offset_asymmetric = (float)(output_attr->asymm.zero_point); - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else - { - outputScale = 1.0f; - output_offset_asymmetric = 0; - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } status = vsi_nn_kernel_gpu_add_param( node, "axisSize", &axisSize ); CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_gpu_config( node, &gpu_param ); diff --git a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c index d9bd40d..aabac06 100644 --- a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c @@ -154,7 +154,6 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t * input_shape = NULL; vsi_size_array_t * output_shape = NULL; - int32_t input_fl = 0, output_fl = 0; int32_t axisSize = 0; float inputScale = 1.0f; float input_offset_asymmetric = 0.0f; @@ -259,68 +258,18 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) } } - if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - input_fl = input_attr->dfp.fl; - if (input_fl > 0) - { - inputScale = 1.0f / (float) ((int64_t)1 << input_fl); - } - else - { - inputScale = (float)((int64_t)1 << -input_fl); - } - status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputScale = 
input_attr->asymm.scale; - input_offset_asymmetric = (float)(input_attr->asymm.zero_point); - status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else - { - inputScale = 1.0f; - input_offset_asymmetric = 0; + inputScale = input_attr->scale; + input_offset_asymmetric = (float)(input_attr->zero_point); + outputScale = 1.0f / output_attr->scale; + output_offset_asymmetric = (float)(output_attr->zero_point); - status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); - if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - output_fl = output_attr->dfp.fl; - if (output_fl > 0) - { - outputScale = (float) ((int64_t)1 << output_fl); - } - else - { - outputScale = 1.0f / (float)((int64_t)1 << -output_fl); - } - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outputScale = 1.0f / output_attr->asymm.scale; - output_offset_asymmetric = (float)(output_attr->asymm.zero_point); - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else - { - outputScale = 1.0f; - output_offset_asymmetric = 0; - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_gpu_add_param( node, "axisSize", &axisSize ); CHECK_STATUS_FAIL_GOTO(status, final ); diff --git a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c index 3c710f5..952063a 100644 --- a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c @@ -160,7 +160,6 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) vsi_size_array_t * output_shape = NULL; vsi_nn_kernel_dtype_e src_dtype = F16; vsi_nn_kernel_dtype_e dst_dtype = F16; - int32_t input_fl = 0, output_fl = 0; int32_t axisSize = 0; float inputScale = 1.0f; float input_offset_asymmetric = 0.0f; @@ -348,68 +347,17 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); } - if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - input_fl = input_attr->dfp.fl; - if (input_fl > 0) - { - inputScale = 1.0f / (float) ((int64_t)1 << input_fl); - } - else - { - inputScale = (float)((int64_t)1 << -input_fl); - } - status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if( 
input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputScale = input_attr->asymm.scale; - input_offset_asymmetric = (float)(input_attr->asymm.zero_point); - status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else - { - inputScale = 1.0f; - input_offset_asymmetric = 0; + inputScale = input_attr->scale; + input_offset_asymmetric = (float)(input_attr->zero_point); + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - - if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - output_fl = output_attr->dfp.fl; - if (output_fl > 0) - { - outputScale = (float) ((int64_t)1 << output_fl); - } - else - { - outputScale = 1.0f / (float)((int64_t)1 << -output_fl); - } - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outputScale = 1.0f / output_attr->asymm.scale; - output_offset_asymmetric = (float)(output_attr->asymm.zero_point); - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else - { - outputScale = 1.0f; - output_offset_asymmetric = 0; - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } + outputScale = 1.0f / output_attr->scale; + output_offset_asymmetric = (float)(output_attr->zero_point); + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_gpu_config( node, &gpu_param ); diff --git a/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c b/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c index 1311117..4dcc321 100644 --- a/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c @@ -138,8 +138,6 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer) float inputTail = 0.0f; float output_ZP = 0; float input_ZP = 0; - int32_t srcFixPointPos = 0; - int32_t dstFixPointPos = 0; VSI_UNREFERENCED(param_size); @@ -154,25 +152,10 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer) output_dtype = output_attr->dtype; offset = alpha * threshold; - if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) - { - srcFixPointPos = input_attr->dfp.fl; - } - else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant) - { - input_ZP = (float)(input_attr->asymm.zero_point); - scaleIn = input_attr->asymm.scale; - } - - if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) - { - dstFixPointPos = output_attr->dfp.fl; - } - else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) - { - output_ZP = (float)(output_attr->asymm.zero_point); - 
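/*
 * A convention worth noting for all of these initializers: the input side is
 * handed to the kernel as the scale itself while the output side is handed
 * the reciprocal, which is why the replacement code reads
 * scaleIn = input_attr->scale but scaleOut = 1.0f / output_attr->scale.
 * A value then makes the round trip as real = (q_in - input_ZP) * scaleIn
 * followed by q_out = real * scaleOut + output_ZP.  A small host-side sketch
 * of that round trip (helper names are illustrative, not ovxlib API):
 */
#include <math.h>
#include <stdint.h>

static float dequantize_sketch(int32_t q, float scale, float zero_point)
{
    return ((float)q - zero_point) * scale;
}

static int32_t requantize_sketch(float value, float inv_scale, float zero_point)
{
    /* Round to nearest; a reasonable choice for the illustration. */
    return (int32_t)lroundf(value * inv_scale + zero_point);
}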
scaleOut = 1.0f / output_attr->asymm.scale; - } + input_ZP = (float)(input_attr->zero_point); + scaleIn = input_attr->scale; + output_ZP = (float)(output_attr->zero_point); + scaleOut = 1.0f / output_attr->scale; gpu_param.global_scale[0] = 8; gpu_param.global_scale[1] = 1; @@ -195,11 +178,6 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer) } else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) { - if (srcFixPointPos >=0 ) - scaleIn = 1.0f / (float) ((int64_t)1 << srcFixPointPos); - else - scaleIn = (float) ((int64_t)1 << -srcFixPointPos); - status = vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); CHECK_STATUS_FAIL_GOTO(status, final ); } @@ -212,11 +190,6 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer) } else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) { - if (dstFixPointPos >=0 ) - scaleOut = (float) ((int64_t)1 << dstFixPointPos); - else - scaleOut = 1.0f / (float) ((int64_t)1 << -dstFixPointPos); - status = vsi_nn_kernel_gpu_add_param(node, "output_scale", &scaleOut); CHECK_STATUS_FAIL_GOTO(status, final ); } diff --git a/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c index 95c33b8..783f5f9 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c @@ -197,8 +197,6 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer) int32_t half_pixel_centers = 0; uint32_t depth = 0; - int32_t srcFixPointPos = 0; - int32_t dstFixPointPos = 0; float input_scale = 1.0; int32_t inputZP = 0; float output_scale = 1.0; @@ -259,53 +257,10 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer) half_pixel_value = 0.0f; } - if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) - { - input_scale = input_attr->asymm.scale; - inputZP = input_attr->asymm.zero_point; - } - else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) - { - srcFixPointPos = input_attr->dfp.fl; - if (srcFixPointPos >= 0) - { - input_scale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); - } - else if (srcFixPointPos < 0) - { - input_scale = (vx_float32)((int64_t)1 << -srcFixPointPos); - } - inputZP = 0; - } - else - { - input_scale = 1.0f; - inputZP = 0; - } - - if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) - { - output_scale = output_attr->asymm.scale; - outputZP = output_attr->asymm.zero_point; - } - else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) - { - dstFixPointPos = output_attr->dfp.fl; - if (dstFixPointPos >= 0) - { - output_scale = (vx_float32) ((int64_t)1 << dstFixPointPos); - } - else if (dstFixPointPos < 0) - { - output_scale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos); - } - outputZP = 0; - } - else - { - output_scale = 1.0; - outputZP = 0; - } + input_scale = input_attr->scale; + inputZP = input_attr->zero_point; + output_scale = output_attr->scale; + outputZP = output_attr->zero_point; if (is_run_nx_kernel) { @@ -473,7 +428,7 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer) } else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) { - float dfpScale = input_scale * output_scale; + float dfpScale = input_scale / output_scale; gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt diff --git a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c index fddd1e3..97c83ff 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c +++ 
b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c @@ -198,52 +198,19 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer) half_pixel_value = 0.0f; } - if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) - { - input_scale = input_attr->asymm.scale; - inputZP = input_attr->asymm.zero_point; - } - else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + input_scale = input_attr->scale; + inputZP = input_attr->zero_point; + output_scale = 1.0f / output_attr->scale; + outputZP = output_attr->zero_point; + + if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) { srcFixPointPos = input_attr->dfp.fl; - if (srcFixPointPos >= 0) - { - input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos); - } - else if (srcFixPointPos < 0) - { - input_scale = (float)((int64_t)1 << -srcFixPointPos); - } - inputZP = 0; - } - else - { - input_scale = 1.0f; - inputZP = 0; } - if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) - { - output_scale = 1.0f / output_attr->asymm.scale; - outputZP = output_attr->asymm.zero_point; - } - else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) + if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) { dstFixPointPos = output_attr->dfp.fl; - if (dstFixPointPos >= 0) - { - output_scale = (float) ((int64_t)1 << dstFixPointPos); - } - else if (dstFixPointPos < 0) - { - output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos); - } - outputZP = 0; - } - else - { - output_scale = 1.0; - outputZP = 0; } if (F16 == input_dtype && F16 == output_dtype) diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index ebfe9ed..d3d3375 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -122,12 +122,16 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] = PACK_KERNEL_MAP_DOWN(I16, I16), PACK_KERNEL_MAP_DOWN(U8, F16), PACK_KERNEL_MAP_DOWN(U8, U8), + PACK_KERNEL_MAP_DOWN(U16, F16), + PACK_KERNEL_MAP_DOWN(U16, U16), PACK_KERNEL_MAP_DOWN(F16, F16), PACK_KERNEL_MAP_DOWN(F16, U8), + PACK_KERNEL_MAP_DOWN(F16, U16), PACK_KERNEL_MAP_DOWN(BF16, BF16), PACK_KERNEL_MAP_UP(I8, I8), PACK_KERNEL_MAP_UP(I16, I16), PACK_KERNEL_MAP_UP(U8, U8), + PACK_KERNEL_MAP_UP(U16, U16), PACK_KERNEL_MAP_UP(F16, F16), PACK_KERNEL_MAP_UP(BF16, BF16), PACK_KERNEL_MAP_UP_OPT(U8, U8), @@ -223,8 +227,6 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) int32_t half_pixel_centers; uint32_t depth = 0; - int32_t srcFixPointPos = 0; - int32_t dstFixPointPos = 0; float input_scale = 1.0; int32_t inputZP = 0; float output_scale = 1.0; @@ -285,201 +287,16 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) half_pixel_value = 0.0f; } - if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) - { - input_scale = input_attr->asymm.scale; - inputZP = input_attr->asymm.zero_point; - } - else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) - { - srcFixPointPos = input_attr->dfp.fl; - if (srcFixPointPos >= 0) - { - input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos); - } - else if (srcFixPointPos < 0) - { - input_scale = (float)((int64_t)1 << -srcFixPointPos); - } - inputZP = 0; - } - else - { - input_scale = 1.0f; - inputZP = 0; - } - - if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) - { - output_scale = output_attr->asymm.scale; - outputZP = output_attr->asymm.zero_point; - } - else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) - { - dstFixPointPos = output_attr->dfp.fl; - if 
(dstFixPointPos >= 0) - { - output_scale = (float) ((int64_t)1 << dstFixPointPos); - } - else if (dstFixPointPos < 0) - { - output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos); - } - outputZP = 0; - } - else - { - output_scale = 1.0; - outputZP = 0; - } + input_scale = input_attr->scale; + inputZP = input_attr->zero_point; + output_scale = output_attr->scale; + outputZP = output_attr->zero_point; gpu_param.global_scale[0] = 4; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) - { - float dfpScale = input_scale * output_scale; - gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000300, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, - 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniExtact8Bit_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniRightSubLeft_4x4 = {{ - 0x09090909, // TCfg - 0x00000000, // ASelt - 0x00230001, 0x00670045, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00000000, 0x00010001, 0x00000000, - 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniDFPtoFp32_left_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00020000, 0x00060004, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, - 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - - if (I8 == input_dtype && I8 == output_dtype && out_width > in_width) - { - gpu_dp_inst_t uniConvertI32toI16_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniGetMaskShift_2x8 = {{ - 0x99999999, // TCfg - 0x00000000, // ASelt - 0x03020100, 0x07060504, // ABin - 0x55555555, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniConvertDFP2FP32_part1_4x4 = {{ - 0x09090909, // TCfg - 0x00000000, // ASelt - 0x00150004, 0x00370026, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000300, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00000000, 0x00010001, 0x00000000, - 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - - status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4); - 
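/*
 * The half_pixel_value chosen earlier in this initializer (0.5f when
 * half_pixel_centers is set, 0.0f otherwise) feeds the usual source
 * coordinate mapping src = (dst + h) * scale - h, the same expression the
 * new resize_cubic kernel uses in _create_scale_tensor().  A compact sketch
 * of that mapping and the bilinear tap weights it implies (the helper is
 * illustrative only, not part of ovxlib):
 */
#include <math.h>

static void bilinear_taps_sketch(int dst_x, float scale_factor, float half_pixel_value,
                                 int *left, float *w_left, float *w_right)
{
    float src = ((float)dst_x + half_pixel_value) * scale_factor - half_pixel_value;
    float floor_src = floorf(src);

    *left    = (int)floor_src;       /* left neighbour of the sample point      */
    *w_right = src - floor_src;      /* fractional part weights the right tap   */
    *w_left  = 1.0f - *w_right;      /* remainder weights the left tap          */
}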
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", - &uniConvertDFP2FP32_part1_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); - CHECK_STATUS_FAIL_GOTO(status, final ); - - gpu_param.global_scale[2] = depth; - } - else if (I16 == input_dtype && I16 == output_dtype && out_width > in_width) - { - gpu_dp_inst_t uniConvertI32toI16_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniGetMaskShift_2x8 = {{ - 0x99999999, // TCfg - 0x00000000, // ASelt - 0x03020100, 0x07060504, // ABin - 0x55555555, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniConvertDFP2FP32_part1_4x4 = {{ - 0x09090909, // TCfg - 0x00000000, // ASelt - 0x00150004, 0x00370026, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000300, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00000000, 0x00010001, 0x00000000, - 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - - status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", - &uniConvertDFP2FP32_part1_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); - CHECK_STATUS_FAIL_GOTO(status, final ); - - gpu_param.global_scale[2] = depth; - } - else - { - status = vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniDFPtoFp32_left_4x4); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - - status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); - status |= vsi_nn_kernel_gpu_add_param( node, "dfpScale", &dfpScale); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (U8 == input_dtype && (U8 == output_dtype || F16 == output_dtype)) + if ((U8 == input_dtype || U16 == input_dtype || I8 == input_dtype || I16 == input_dtype)) { float uint8Scale = input_scale / output_scale; float uint8ZP_out = (float)outputZP; @@ -615,7 +432,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) } CHECK_STATUS_FAIL_GOTO(status, final ); } - else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype)) + else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype || U16 == output_dtype)) { float uint8Scale = 1.0f / output_scale; float uint8ZP_out = (float)outputZP; diff --git a/src/tim/vx/internal/src/kernel/evis/resize_cubic_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_cubic_evis.c new file mode 100644 index 0000000..618b33f --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/resize_cubic_evis.c @@ -0,0 +1,453 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is 
hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +#define _RESIZE_CUBIC_KERNEL_SOURCE() "resize_cubic" + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) ) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("evis.resize_cubic_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _RESIZE_CUBIC_KERNEL_SOURCE() } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_cubic_kernel_map[] = +{ + PACK_KERNEL_MAP(F16, F16), + PACK_KERNEL_MAP(I16, I16), + PACK_KERNEL_MAP(F16, I16), + PACK_KERNEL_MAP(I16, F16), + PACK_KERNEL_MAP(I8, I8), + PACK_KERNEL_MAP(F16, I8), + PACK_KERNEL_MAP(I8, F16), + PACK_KERNEL_MAP(U8, U8), + PACK_KERNEL_MAP(F16, U8), + PACK_KERNEL_MAP(U8, F16), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _resize_cubic_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define RESIZE_CUBIC_NUM _cnt_of_array( _resize_cubic_kernel_param_def ) + + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_cubic_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t *input_attr = NULL; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_size_array_t * out_shape = NULL; + + float input_scale = 1.0; + float input_tail = 0; + float output_scale = 1.0; + float output_tail = 0; + + VSI_UNREFERENCED(param_size); + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0]); + 
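/*
 * The scale tensor consumed by this kernel is built further down in this
 * file by _create_scale_tensor(): for each output position it stores four
 * weights that match the Keys cubic-convolution kernel with
 * cubic_coeff_a = -0.5, evaluated at distances d+1, d, 1-d and 2-d from the
 * source coordinate (d is the fractional offset; the companion index tensor
 * stores floor(src) - 1, the left-most of the four taps).  An equivalent,
 * unfactored form of those weights, handy for checking the factored
 * expressions used there (the helper itself is illustrative only):
 */
static void cubic_weights_sketch(float d, float a, float w[4])
{
    float e = 1.0f - d;

    w[0] = a * d * e * e;                                       /* tap at d + 1 */
    w[1] = (a + 2.0f) * d * d * d - (a + 3.0f) * d * d + 1.0f;  /* tap at d     */
    w[2] = (a + 2.0f) * e * e * e - (a + 3.0f) * e * e + 1.0f;  /* tap at 1 - d */
    w[3] = a * e * d * d;                                       /* tap at 2 - d */
    /* For any d in [0, 1) the four weights sum to 1. */
}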
CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1]); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + out_shape = output_attr->shape; + + if ( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = input_attr->dfp.fl; + if (fl > 0) + { + input_scale = 1.0f / (float) ((int64_t)1 << fl); + } + else + { + input_scale = (float)((int64_t)1 << -fl); + } + } + else if ( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + input_scale = input_attr->asymm.scale; + input_tail = 0 - input_scale * (float)input_attr->asymm.zero_point; + } + + if ( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = output_attr->dfp.fl; + if (fl > 0) + { + output_scale = (float) ((int64_t)1 << fl); + } + else + { + output_scale = 1.0f / (float)((int64_t)1 << -fl); + } + } + else if ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + output_scale = 1.0f / output_attr->asymm.scale; + output_tail = (float)output_attr->asymm.zero_point; + } + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + { + gpu_dp_inst_t uniFp16ToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniExtract8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + status = vsi_nn_kernel_gpu_add_param( node, "uniFp16ToFp32_4x4", &uniFp16ToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtractHalf8_2x8", &uniExtractHalf8_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Bit_2x8", &uniExtract8Bit_2x8); + } + status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "input_tail", &input_tail); + status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "output_tail", &output_tail); + + + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + return status; +} /* _resize_cubic_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _resize_cubic_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_cubic_kernel_map ); + vx_param_description_t * param_def = _resize_cubic_kernel_param_def; + size_t param_def_size = RESIZE_CUBIC_NUM; + vx_kernel_initialize_f initializer = _resize_cubic_initializer; + + uint32_t key = 0; + uint32_t i = 0; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = RESIZE_CUBIC_HASH_KEY( in_dtype, out_dtype ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + +static vsi_nn_tensor_t* _create_scale_tensor + ( + vsi_nn_graph_t *graph, + vsi_size_t output_size, + float scale_factor, + float half_pixel_value, + vsi_nn_tensor_t** index + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* scale = NULL; + vsi_size_t i = 0; + float *scale_data_ptr = NULL; + int *index_data_ptr = NULL; + float scale_value = 0; + vsi_ssize_t data = 0; + int idx = 0; + float delta_v = 0; + float cubic_coeff_a = -0.5f; + vsi_size_t item_count = 4 * output_size; + scale_data_ptr = (float *)malloc(item_count * sizeof(float)); + if (scale_data_ptr == NULL) + { + VSILOGE("allocate memory fail at function %s line %d", __FUNCTION__, __LINE__); + goto OnError; + } + + index_data_ptr = (int *)malloc(output_size * sizeof(int)); + if (index_data_ptr == NULL) + { + VSILOGE("allocate memory fail at function %s line %d", __FUNCTION__, __LINE__); + goto OnError; + } + + for (i = 0; i < output_size; i ++) + { + scale_value = ((float)i + half_pixel_value) * scale_factor - half_pixel_value; + data = (vsi_ssize_t)scale_value; + delta_v = scale_value - (float)data; + idx = (int)data - 1; + + index_data_ptr[i] = idx; + scale_data_ptr[i * 4 + 0] = cubic_coeff_a * (((delta_v - 4) * (delta_v + 1) + 8) * (delta_v + 1) - 4); + scale_data_ptr[i * 4 + 1] = ((cubic_coeff_a + 2) * delta_v - (cubic_coeff_a + 3)) * delta_v *delta_v + 1; + scale_data_ptr[i * 4 + 2] = ((cubic_coeff_a + 2) * (1 - delta_v) - (cubic_coeff_a + 3)) + * (1 - delta_v) * (1 - delta_v) + 1; + scale_data_ptr[i * 4 + 3] = cubic_coeff_a * ((( 2 - delta_v - 5) * (2 - delta_v) + 8) * (2 - delta_v) - 4); + } + attr.size[0] = item_count; + attr.dim_num = 1; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.vtl = 
FALSE; + + scale = vsi_nn_CreateTensorFromData(graph, (uint8_t *)scale_data_ptr, &attr); + if (scale_data_ptr) + { + free (scale_data_ptr); + scale_data_ptr = NULL; + } + + attr.size[0] = output_size; + attr.dim_num = 1; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.vtl = FALSE; + + *index = vsi_nn_CreateTensorFromData(graph, (uint8_t *)index_data_ptr, &attr); + if (index_data_ptr) + { + free (index_data_ptr); + index_data_ptr = NULL; + } + +OnError: + return scale; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[RESIZE_CUBIC_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + vsi_size_t in_width = inputs[0]->attr.size[0]; + vsi_size_t in_height = inputs[0]->attr.size[1]; + vsi_size_t out_width = outputs[0]->attr.size[0]; + vsi_size_t out_height = outputs[0]->attr.size[1]; + float half_pixel_value = 0.0f; + float width_scale = 0.0f; + float height_scale = 0.0f; + vsi_nn_tensor_t* scale_w = NULL; + vsi_nn_tensor_t* scale_h = NULL; + vsi_nn_tensor_t* index_w = NULL; + vsi_nn_tensor_t* index_h = NULL; + + if (align_corners && out_width > 1) + { + width_scale = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1); + } + else + { + width_scale = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width; + } + + if (align_corners && out_height > 1) + { + height_scale = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1); + } + else + { + height_scale = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height; + } + + if (half_pixel_centers) + { + half_pixel_value = 0.5f; + } + else + { + half_pixel_value = 0.0f; + } + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + size_t node_params_num = RESIZE_CUBIC_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, RESIZE_CUBIC_NUM, + inputs, input_num, outputs, output_num ); + scale_w = _create_scale_tensor(graph, out_width,\ + width_scale, half_pixel_value, &index_w); + CHECK_PTR_FAIL_GOTO( scale_w, "Create buffer fail.", final ); + CHECK_PTR_FAIL_GOTO( index_w, "Create buffer fail.", final ); + scale_h = _create_scale_tensor(graph, out_height,\ + height_scale, half_pixel_value, &index_h); + CHECK_PTR_FAIL_GOTO( scale_h, "Create buffer fail.", final ); + CHECK_PTR_FAIL_GOTO( index_h, "Create buffer fail.", final ); + node_params[2] = (vsi_nn_kernel_node_param_t)(scale_w->t); + node_params[3] = (vsi_nn_kernel_node_param_t)(scale_h->t); + node_params[4] = (vsi_nn_kernel_node_param_t)(index_w->t); + node_params[5] = (vsi_nn_kernel_node_param_t)(index_h->t); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + } + } + +final: + vsi_safe_release_tensor(scale_w); + vsi_safe_release_tensor(scale_h); + vsi_safe_release_tensor(index_w); + vsi_safe_release_tensor(index_h); + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( resize_cubic, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c index 6bf9ba8..99312c1 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c @@ -208,52 +208,19 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer) half_pixel_value = 0.0f; } - if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) - { - input_scale = input_attr->asymm.scale; - inputZP = input_attr->asymm.zero_point; - } - else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + input_scale = input_attr->scale; + inputZP = input_attr->zero_point; + output_scale = 1.0f / output_attr->scale; + outputZP = output_attr->zero_point; + + if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) { srcFixPointPos = input_attr->dfp.fl; - if (srcFixPointPos >= 0) - { - input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos); - } - else if (srcFixPointPos < 0) - { - input_scale = (float)((int64_t)1 << -srcFixPointPos); - } - inputZP = 0; - } - else - { - input_scale = 1.0f; - inputZP = 0; } - if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) - { - output_scale = 1.0f / output_attr->asymm.scale; - outputZP = output_attr->asymm.zero_point; - } - else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) + if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) { dstFixPointPos = output_attr->dfp.fl; - if (dstFixPointPos >= 0) - { - output_scale = (float) ((int64_t)1 << dstFixPointPos); - } - else if (dstFixPointPos < 0) - { - output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos); - } - outputZP = 0; - } - else - { - output_scale = 1.0; - outputZP = 0; } if (F16 == input_dtype && F16 == output_dtype) diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c index bba21ea..e52d396 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c @@ -208,10 +208,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer) height = (int32_t)(attr[2]->shape->data[1]); index_num = (int32_t)(attr[0]->shape->data[1]); - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - output_zp = attr[2]->asymm.zero_point; - } + output_zp = attr[2]->zero_point; if (coord_dim == 3) { @@ -367,10 +364,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_big_initializer) height = (int32_t)(attr[2]->shape->data[1]); index_num = (int32_t)(attr[0]->shape->data[1]); - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - output_zp = attr[2]->asymm.zero_point; - } + output_zp = attr[2]->zero_point; if (coord_dim == 3) { diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c index b59bccf..4786abb 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c +++ 
b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c @@ -382,6 +382,12 @@ static vsi_status check_scatter_nd_update_index_repeat int32_t* mask_buffer = NULL; int32_t mask_len = 0; + if (indices_num == 1) + { + isRepeat[0] = 0; + return VSI_SUCCESS; + } + if (inputs[1]->attr.is_const == FALSE) { isRepeat[0] = 1; @@ -451,7 +457,7 @@ static vsi_status check_scatter_nd_update_index_repeat else if (mask_buffer[mask_idx] > 0) { isRepeat[0] = 1; - status = VSI_FAILURE; + status = VSI_SUCCESS; CHECK_STATUS_FAIL_GOTO( status, final ); } } diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_reduction_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_reduction_evis.c new file mode 100644 index 0000000..f2ffdc0 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_reduction_evis.c @@ -0,0 +1,861 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +typedef enum +{ + NONE = 0, + Add, + Mul, + Max, + Min +} vsi_scatter_nd_update_type_e; + +/* + * Define kernel meta. 
+ */ +#define KERNEL_SOURCE_1 "scatter_nd_update_reduction" +#define KERNEL_SOURCE_2 "scatter_nd_update_reduction_conv" + +#define HASH_SCATTER_ND_UPDATE_KEY(_in0_type, _in2_type, _out_type, _stage, _op_type) \ + ((_in0_type << 24) | (_in2_type << 16) | (_out_type << 8) | (_stage << 4) | (_op_type)) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_reduction_preprocess_"#SRC0_TYPE) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, SRC2_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_reduction_"#REDUCTION_TYPE"_"#SRC2_TYPE) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_reduction_conv_"#DST_TYPE) + +#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(IN0_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, 0, 0, 0, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(IN0_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(REDUCTION_TYPE, IN2_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, 0, 1, REDUCTION_TYPE), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, IN2_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(0, 0, OUT_TYPE, 2, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(OUT_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type scatter_nd_update_reduction_preprocess_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(F16, KERNEL_SOURCE_1) +}; + +static const _kernel_map_type scatter_nd_update_reduction_process_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, BF16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, BF16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, BF16, KERNEL_SOURCE_1) + 
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, BF16, KERNEL_SOURCE_1) +}; + +static const _kernel_map_type scatter_nd_update_reduction_conv_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(U8, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I8, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I16, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(F16, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(BF16, KERNEL_SOURCE_2) +}; + +/* + * Kernel params + */ +static vx_param_description_t _scatter_nd_update_preprocess_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static vx_param_description_t _scatter_nd_update_process_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static vx_param_description_t _scatter_nd_update_conv_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +#define _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_preprocess_kernel_param_def) +#define _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_process_kernel_param_def) +#define _SCATTER_ND_UPDATE_CONV_PARAM_NUM _cnt_of_array(_scatter_nd_update_conv_kernel_param_def) + +static vsi_status get_scatter_nd_update_tensor_reshape_size + ( + vsi_nn_tensor_t ** inputs, + vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], + uint32_t block_size, + uint32_t coordDim, + vsi_size_t strides[VSI_NN_MAX_DIM_NUM], + int32_t* newDim + ) +{ + vsi_status status = VSI_SUCCESS; + uint32_t dims_num = inputs[0]->attr.dim_num; + vsi_size_t *input_size = inputs[0]->attr.size; + uint32_t i = 0; + vsi_size_t elementCnt = 1; + +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH + + newDim[0] = 0; + for (i = 0; i < dims_num; ++i) + { + elementCnt *= input_size[i]; + } + + for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + { + sizes[i] = 1; + } + + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + newDim[0] = 2; + + if (coordDim == 1 && strides) // index shape + { + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + strides[i] = 0; + } + } + else if (coordDim >= 2 && coordDim <= VSI_NN_MAX_DIM_NUM && strides) + { + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + strides[i] = 0; + } + + strides[0] = input_size[dims_num - coordDim]; + for (i = 1; i < coordDim - 1; i++) + { + 
strides[i] = strides[i - 1] * input_size[dims_num - coordDim + i]; + } + } + +#undef VSI_NN_MAX_IMAGE_WIDTH + + return status; +} /* get_scatter_nd_update_tensor_reshape_size() */ + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_scatter_nd_update_preprocess_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 1, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t width = 0; + int32_t element_size = 1; + int32_t input_zp0 = 0; + float input_scale0 = 1; + int32_t i = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + for (i = 0; i < (int32_t)attr[0]->shape->size; i++) + { + element_size *= (int32_t)attr[0]->shape->data[i]; + } + width = element_size / 8; + + input_zp0 = attr[0]->zero_point; + input_scale0 = attr[0]->scale; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) + { + input_scale0 = 1.0f; + } + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + if (element_size < 8) + { + gpu_param.global_size[0] = element_size; + } + else + { + gpu_param.global_size[0] = width; + } + gpu_param.global_size[1] = 1; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert2ndU8SubZpToFp32_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvert2ndU8SubZpToFp32_4x4", &uniConvert2ndU8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &input_scale0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_zp", &input_zp0 ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _scatter_nd_update_preprocess_initializer() */ + +DEF_KERNEL_INITIALIZER(_scatter_nd_update_process_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + int32_t block_size = 1; + int32_t update_width = 1; + int32_t index_num = 1; + int32_t width = 0; + int32_t coord_dim = 0; + int32_t strides[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t coord_strides[8] = {0}; + int32_t coord_strides1[4] = {0}; + int32_t input_zp2 = 0; + float 
input_scale2 = 1; + int32_t i = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &strides[0]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &strides[1]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &strides[2]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &strides[3]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &strides[4]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &strides[5]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &strides[6]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &coord_dim); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + block_size = (int32_t)(attr[2]->shape->data[0]); + update_width = (int32_t)(attr[1]->shape->data[0]); + index_num = (int32_t)(attr[0]->shape->data[1]); + width = block_size; + + input_zp2 = attr[1]->zero_point; + input_scale2 = attr[1]->scale; + + coord_strides[coord_dim - 1] = 1; + for (i = 0; i < coord_dim - 1; i++) + { + coord_strides[i] = strides[coord_dim - 2 - i]; + } + memcpy(coord_strides1, coord_strides + 4, 4 * sizeof(int32_t)); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = width; + gpu_param.global_size[1] = index_num; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); + status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride", &coord_strides ); + status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride1", &coord_strides1 ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + if (attr[1]->dtype == U8 || 
attr[1]->dtype == I8 || attr[1]->dtype == I16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "update_scale", &input_scale2 ); + status |= vsi_nn_kernel_gpu_add_param( node, "update_zp", &input_zp2 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if (attr[1]->dtype == BF16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + return status; +} /* _scatter_nd_update_process_initializer() */ + +DEF_KERNEL_INITIALIZER(_scatter_nd_update_conv_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 1, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t width = 0; + int32_t element_size = 1; + int32_t i = 0; + float output_zp = 0; + float output_scale = 1.0f; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + output_zp = (float)attr[0]->zero_point; + output_scale = (float)1.0 / attr[0]->scale; + + for (i = 0; i < (int32_t)attr[0]->shape->size; i++) + { + element_size *= (int32_t)attr[0]->shape->data[i]; + } + width = element_size / 8; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + if (element_size < 8) + { + gpu_param.global_size[0] = element_size; + } + else + { + gpu_param.global_size[0] = width; + } + gpu_param.global_size[1] = 1; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 ); + status |= 
vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _scatter_nd_update_conv_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel_preprocess, + vsi_nn_kernel_t* kernel_process, + vsi_nn_kernel_t* kernel_conv, + int32_t reduction_flg + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e input2_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + size_t i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_SCATTER_ND_UPDATE_KEY(input0_dtype, 0, 0, 0, 0); + + for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map); i ++ ) + { + if ( scatter_nd_update_reduction_preprocess_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map) ) + { + snprintf( kernel_preprocess->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_reduction_preprocess_map[i].function_name ); + kernel_preprocess->info.parameters = _scatter_nd_update_preprocess_kernel_param_def; + kernel_preprocess->info.numParams = _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM; + kernel_preprocess->info.initialize = _scatter_nd_update_preprocess_initializer; + + vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + scatter_nd_update_reduction_preprocess_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_reduction_preprocess_map[i].source_name ); + } + else + { + status = VSI_FAILURE; + } + + key = HASH_SCATTER_ND_UPDATE_KEY( 0, input2_dtype, 0, 1, reduction_flg); + + for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_process_map); i ++ ) + { + if ( scatter_nd_update_reduction_process_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(scatter_nd_update_reduction_process_map) ) + { + snprintf( kernel_process->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_reduction_process_map[i].function_name ); + kernel_process->info.parameters = _scatter_nd_update_process_kernel_param_def; + kernel_process->info.numParams = _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM; + kernel_process->info.initialize = _scatter_nd_update_process_initializer; + + vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + scatter_nd_update_reduction_process_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_reduction_process_map[i].source_name ); + } + else + { + status |= VSI_FAILURE; + } + + key = HASH_SCATTER_ND_UPDATE_KEY( 0, 0, output_dtype, 2, 0); + + for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_conv_map); i ++ ) + { + if ( scatter_nd_update_reduction_conv_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(scatter_nd_update_reduction_conv_map) ) + { + snprintf( 
kernel_conv->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_reduction_conv_map[i].function_name ); + kernel_conv->info.parameters = _scatter_nd_update_conv_kernel_param_def; + kernel_conv->info.numParams = _SCATTER_ND_UPDATE_CONV_PARAM_NUM; + kernel_conv->info.initialize = _scatter_nd_update_conv_initializer; + + vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + scatter_nd_update_reduction_conv_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_reduction_conv_map[i].source_name ); + } + else + { + status |= VSI_FAILURE; + } + + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t strides[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); + int32_t reduction = vsi_nn_kernel_param_get_int32( params, "reduction" ); + int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; + int32_t i = 0; + vsi_nn_tensor_t * tensors[2] = { NULL }; + vsi_nn_kernel_t * ikernels[2] = { NULL }; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + status = get_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], coord_dim, 0, + NULL, &rs_idx_dim); + status |= get_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1], block_size, 0, + NULL, &rs_in_dim); + status |= get_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim, + strides, &rs_out_dim); + CHECK_STATUS_FAIL_GOTO( status, final ); + + { + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_node_t preprocess_node = NULL; + vsi_nn_kernel_node_t process_node = NULL; + vsi_nn_kernel_node_param_t preprocess_params[_SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t process_params[_SCATTER_ND_UPDATE_PROCESS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t conv_params[_SCATTER_ND_UPDATE_CONV_PARAM_NUM] = { NULL }; + int32_t width = 1; + int32_t res = 0; + + ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + ikernels[0]->unique_id = kernel->unique_id; + ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + ikernels[1]->unique_id = kernel->unique_id; + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype = outputs[0]->attr.dtype; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.is_const = FALSE; + attr.vtl = TRUE; + + for (i = 0; i < rs_out_dim; i++) + { + attr.size[i] = shapes[2][i]; + width *= (int32_t)shapes[2][i]; + } + attr.dim_num = rs_out_dim; + + res = width % 8; + width = (width >> 3) << 3; + + tensors[0] = vsi_nn_CreateTensor( graph, &attr ); // ref' + attr.size[0] = 1; + attr.size[1] = 1; + attr.dim_num = rs_out_dim; + tensors[1] = vsi_nn_CreateTensor( graph, &attr ); // link_buffer0 + + status = _query_kernel( inputs, outputs, ikernels[0], ikernels[1], kernel, reduction); + if ( VSI_SUCCESS == status) + { + // convert ref to float + preprocess_node = vsi_nn_kernel_create_node( graph, ikernels[0] ); + if (preprocess_node) + { + uint32_t index = 0; + /* Pass parameters to node. 
*/ + preprocess_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim ); + preprocess_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res ); + status = vsi_nn_kernel_node_pass_param( preprocess_node, preprocess_params, + _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &preprocess_params[0] ); + vsi_nn_kernel_scalar_release( &preprocess_params[2] ); + vsi_nn_kernel_scalar_release( &preprocess_params[3] ); + } + + // update + process_node = vsi_nn_kernel_create_node( graph, ikernels[1] ); + if (process_node) + { + uint32_t index = 0; + /* Pass parameters to node. */ + process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); + process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); + process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[0] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[1] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[2] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[3] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[4] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[5] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[6] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + status = vsi_nn_kernel_node_pass_param( process_node, process_params, + _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &process_params[0] ); + vsi_nn_kernel_tensor_release( &process_params[1] ); + vsi_nn_kernel_scalar_release( &process_params[4] ); + vsi_nn_kernel_scalar_release( &process_params[5] ); + vsi_nn_kernel_scalar_release( &process_params[6] ); + vsi_nn_kernel_scalar_release( &process_params[7] ); + vsi_nn_kernel_scalar_release( &process_params[8] ); + vsi_nn_kernel_scalar_release( &process_params[9] ); + vsi_nn_kernel_scalar_release( &process_params[10] ); + vsi_nn_kernel_scalar_release( &process_params[11] ); + } + + // convert float to output + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 0; + /* Pass parameters to node. 
*/ + conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + conv_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); + conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res ); + status = vsi_nn_kernel_node_pass_param( node, conv_params, _SCATTER_ND_UPDATE_CONV_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &conv_params[2] ); + vsi_nn_kernel_scalar_release( &conv_params[3] ); + vsi_nn_kernel_scalar_release( &conv_params[4] ); + } + } + + if (preprocess_node) {vsi_nn_kernel_node_release( &preprocess_node );} + if (process_node) {vsi_nn_kernel_node_release( &process_node );} + } + +final: + if (ikernels[0]) + { + vsi_nn_kernel_release(&ikernels[0]); + } + if (ikernels[1]) + { + vsi_nn_kernel_release(&ikernels[1]); + } + vsi_safe_release_tensor(tensors[0]); + vsi_safe_release_tensor(tensors[1]); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( scatter_nd_update_reduction, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/select_evis.c b/src/tim/vx/internal/src/kernel/evis/select_evis.c index b918e2c..4d8e90b 100644 --- a/src/tim/vx/internal/src/kernel/evis/select_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/select_evis.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_TENSOR_SELECT_VX_SUPPORT) #include #include #include @@ -159,7 +160,6 @@ DEF_KERNEL_INITIALIZER(_select_initializer) vsi_nn_kernel_tensor_attr_t *input1_attr = NULL; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; - int32_t input0_fl = 0, input1_fl = 0, output_fl = 0; float input0Scale = 1.0f; int32_t input0Zp = 0; float input1Scale = 1.0f; @@ -180,59 +180,12 @@ DEF_KERNEL_INITIALIZER(_select_initializer) output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); - if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - input0_fl = input0_attr->dfp.fl; - if (input0_fl > 0) - { - input0Scale = 1.0f / (float) ((int64_t)1 << input0_fl); - } - else - { - input0Scale = (float)((int64_t)1 << -input0_fl); - } - } - else if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input0Scale = input0_attr->asymm.scale; - input0Zp = input0_attr->asymm.zero_point; - } - - if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - input1_fl = input1_attr->dfp.fl; - if (input1_fl > 0) - { - input1Scale = 1.0f / (float) ((int64_t)1 << input1_fl); - } - else - { - input1Scale = (float)((int64_t)1 << -input1_fl); - } - } - else if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input1Scale = input1_attr->asymm.scale; - input1Zp = input1_attr->asymm.zero_point; - } - - if ( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - output_fl = output_attr->dfp.fl; - if (output_fl > 0) - { - outputScale = 1.0f / (float) ((int64_t)1 << output_fl); - } - else - { - outputScale = (float)((int64_t)1 << -output_fl); - } - } - else if ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outputScale = output_attr->asymm.scale; - outputZP = output_attr->asymm.zero_point; - } + input0Scale = input0_attr->scale; + input0Zp = input0_attr->zero_point; + input1Scale = input1_attr->scale; + input1Zp = input1_attr->zero_point; + outputScale = output_attr->scale; + outputZP = 
output_attr->zero_point; gpu_quantize_multiplier_16bit(input0Scale / outputScale, &in0_M0, &in0_postShift); gpu_quantize_multiplier_16bit(input1Scale / outputScale, &in1_M0, &in1_postShift); @@ -541,3 +494,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( select, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c b/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c index b2e22ed..dde408d 100644 --- a/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c @@ -131,42 +131,10 @@ DEF_KERNEL_INITIALIZER(_sequence_mask_initializer) CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); out_shape = attr[1]->shape; - - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - input_zp = 0; - } - - if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - output_zp = attr[1]->asymm.zero_point; - scaleOut = 1.0f / attr[1]->asymm.scale; - } - else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[1]->dfp.fl > 0) - { - scaleOut = (float)((int64_t)1 << attr[1]->dfp.fl); - } - else - { - scaleOut = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); - } - output_zp = 0; - } + input_zp = attr[0]->zero_point; + scaleIn = attr[0]->scale; + output_zp = attr[1]->zero_point; + scaleOut = 1.0f / attr[1]->scale; outputVal1 = scaleOut + (float)output_zp; diff --git a/src/tim/vx/internal/src/kernel/evis/slice_evis.c b/src/tim/vx/internal/src/kernel/evis/slice_evis.c index 773d38b..2362e2c 100644 --- a/src/tim/vx/internal/src/kernel/evis/slice_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/slice_evis.c @@ -157,8 +157,6 @@ DEF_KERNEL_INITIALIZER(_slice_initializer) float scaleOut = 1.0f; int32_t output_ZP = 0; int32_t input_ZP = 0; - int32_t srcFixPointPos = 0; - int32_t dstFixPointPos = 0; int32_t is_samefl = 0; uint32_t pack_key = 0; @@ -178,41 +176,10 @@ DEF_KERNEL_INITIALIZER(_slice_initializer) pack_key = _PACK_SLICE_KEY( input_dtype, output_dtype, is_samefl); - if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) - { - srcFixPointPos = input_attr->dfp.fl; - if (srcFixPointPos > 0) - { - scaleIn = (1.0f / ((float) ((int64_t)1 << srcFixPointPos))); - } - else - { - scaleIn = ((float) ((int64_t)1 << -srcFixPointPos)); - } - } - else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant) - { - input_ZP = input_attr->asymm.zero_point; - scaleIn = input_attr->asymm.scale; - } - - if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) - { - dstFixPointPos = output_attr->dfp.fl; - if (dstFixPointPos > 0) - { - scaleOut = (1.0f / ((float) ((int64_t)1 << dstFixPointPos))); - } - else - { - scaleOut = ((float) ((int64_t)1 << -dstFixPointPos)); - } - } - else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) - { - output_ZP = output_attr->asymm.zero_point; - scaleOut = output_attr->asymm.scale; - } + input_ZP = input_attr->zero_point; + scaleIn = input_attr->scale; + output_ZP = output_attr->zero_point; + scaleOut = output_attr->scale; if ((I8 == input_dtype && input_dtype == output_dtype ) || (U8 == input_dtype && input_dtype == output_dtype ) ) diff --git a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c 
b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c index f31de54..eb38746 100644 --- a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c @@ -170,23 +170,8 @@ DEF_KERNEL_INITIALIZER(_get_matrix_initializer) attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); - if ( attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr->dfp.fl; - if (fl > 0) - { - input_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input_scale = (float)((int64_t)1 << -fl); - } - } - else if ( attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input_scale = attr->asymm.scale; - input_tail = 0 - attr->asymm.zero_point * input_scale; - } + input_scale = attr->scale; + input_tail = 0 - attr->zero_point * input_scale; in_shape = attr->shape; @@ -265,42 +250,10 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer) attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[0]->dfp.fl; - if (fl > 0) - { - input_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input_scale = (float)((int64_t)1 << -fl); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input_scale = attr[0]->asymm.scale; - input_tail = 0 - attr[0]->asymm.zero_point * input_scale; - } - - if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - int32_t fl = attr[1]->dfp.fl; - - if (fl >= 0) - { - output_scale = (vx_float32) ((vx_int64)1 << fl); - } - else if (fl < 0) - { - output_scale = 1.0f / (vx_float32) ((vx_int64)1 << -fl); - } - } - else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - output_scale = 1.0f / attr[1]->asymm.scale; - output_zp = (float)attr[1]->asymm.zero_point; - } + input_scale = attr[0]->scale; + input_tail = 0 - attr[0]->zero_point * input_scale; + output_scale = 1.0f / attr[1]->scale; + output_zp = (float)attr[1]->zero_point; out_shape = attr[1]->shape; diff --git a/src/tim/vx/internal/src/kernel/evis/swish_evis.c b/src/tim/vx/internal/src/kernel/evis/swish_evis.c index befe6ac..f1ad40b 100644 --- a/src/tim/vx/internal/src/kernel/evis/swish_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/swish_evis.c @@ -166,8 +166,6 @@ DEF_KERNEL_INITIALIZER(_swish_initializer) vx_tensor input = (vx_tensor)param[0]; vx_tensor output = (vx_tensor)param[1]; - int8_t srcFixPointPos = 0; - int8_t dstFixPointPos = 0; vx_float32 inputTail = 0; vx_float32 inputScale = 1.0f; vx_float32 outputZP = 0; @@ -186,42 +184,11 @@ DEF_KERNEL_INITIALIZER(_swish_initializer) CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); out_shape = output_attr->shape; + inputScale = input_attr->scale; + inputTail = 0 - (vx_float32)input_attr->zero_point * inputScale; + outputScale = 1.0f / output_attr->scale; + outputZP = (vx_float32)(output_attr->zero_point); - if (input_attr->quant == VSI_NN_KERNEL_QUANT_DFP) - { - srcFixPointPos = (int8_t)input_attr->dfp.fl; - if (srcFixPointPos > 0) - { - inputScale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); - } - else - { - inputScale = (vx_float32)((int64_t)1 << -srcFixPointPos); - } - } - else if (input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || input_attr->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - inputScale = input_attr->asymm.scale; - inputTail = 0 - input_attr->asymm.zero_point * inputScale; - 
} - - if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP) - { - dstFixPointPos = (int8_t)output_attr->dfp.fl; - if (dstFixPointPos > 0) - { - outputScale = (vx_float32) ((int64_t)1 << dstFixPointPos); - } - else - { - outputScale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos); - } - } - else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || output_attr->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - outputScale = 1.0f / output_attr->asymm.scale; - outputZP = (vx_float32)(output_attr->asymm.zero_point); - } #define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \ (IN_TYPE | ( OUT_TYPE << 16)) @@ -379,8 +346,6 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer) vx_tensor input = (vx_tensor)param[0]; vx_tensor output = (vx_tensor)param[1]; - int8_t srcFixPointPos = 0; - int8_t dstFixPointPos = 0; vx_float32 inputTail = 0; vx_float32 inputScale = 1.0f; vx_float32 outputZP = 0; @@ -398,42 +363,11 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer) CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); out_shape = output_attr->shape; + inputScale = input_attr->scale; + inputTail = 0 - (vx_float32)input_attr->zero_point * inputScale; + outputScale = 1.0f / output_attr->scale; + outputZP = (vx_float32)(output_attr->zero_point); - if (input_attr->quant == VSI_NN_KERNEL_QUANT_DFP) - { - srcFixPointPos = (int8_t)input_attr->dfp.fl; - if (srcFixPointPos > 0) - { - inputScale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); - } - else - { - inputScale = (vx_float32)((int64_t)1 << -srcFixPointPos); - } - } - else if (input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || input_attr->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - inputScale = input_attr->asymm.scale; - inputTail = 0 - input_attr->asymm.zero_point * inputScale; - } - - if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP) - { - dstFixPointPos = (int8_t)output_attr->dfp.fl; - if (dstFixPointPos > 0) - { - outputScale = (vx_float32) ((int64_t)1 << dstFixPointPos); - } - else - { - outputScale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos); - } - } - else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || output_attr->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - outputScale = 1.0f / output_attr->asymm.scale; - outputZP = (vx_float32)(output_attr->asymm.zero_point); - } #define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \ (IN_TYPE | ( OUT_TYPE << 16)) diff --git a/src/tim/vx/internal/src/kernel/evis/tile_evis.c b/src/tim/vx/internal/src/kernel/evis/tile_evis.c index f46941a..4fc76f9 100644 --- a/src/tim/vx/internal/src/kernel/evis/tile_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/tile_evis.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_TENSOR_TILE_API_SUPPORT) #include #include #include @@ -280,42 +281,10 @@ DEF_KERNEL_INITIALIZER(_tile_initializer) CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); in_shape = attr[0]->shape; - - if (VSI_NN_KERNEL_QUANT_DFP == attr[0]->quant) - { - if (attr[0]->dfp.fl > 0) - { - scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - input_ZP = 0; - } - else if (VSI_NN_KERNEL_QUANT_ASYMM == attr[0]->quant) - { - input_ZP = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - - if (VSI_NN_KERNEL_QUANT_DFP == attr[1]->quant) - { - if (attr[1]->dfp.fl > 0) - { - scaleOut = (float)((int64_t)1 << attr[1]->dfp.fl); - } - else - { - scaleOut = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); - } - output_ZP = 0; - } - 
else if (VSI_NN_KERNEL_QUANT_ASYMM == attr[1]->quant) - { - output_ZP = attr[1]->asymm.zero_point; - scaleOut = attr[1]->asymm.scale; - } + input_ZP = attr[0]->zero_point; + scaleIn = attr[0]->scale; + output_ZP = attr[1]->zero_point; + scaleOut = attr[1]->scale; #define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \ (( IN_TYPE << 16) | ( OUT_TYPE)) @@ -626,3 +595,4 @@ final: REGISTER_BACKEND_EVIS( tile, _setup ) __END_DECLS +#endif diff --git a/src/tim/vx/internal/src/kernel/evis/upsample_evis.c b/src/tim/vx/internal/src/kernel/evis/upsample_evis.c index fb78c49..e2327e3 100644 --- a/src/tim/vx/internal/src/kernel/evis/upsample_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/upsample_evis.c @@ -174,41 +174,19 @@ DEF_KERNEL_INITIALIZER(_upsample_initializer) src_dtype = input_attr->dtype; dst_dtype = output_attr->dtype; axis_dtype = axis_attr->dtype; + inputScale = input_attr->scale; + input_ZP = input_attr->zero_point; + outputScale = output_attr->scale; + output_ZP = output_attr->zero_point; if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { input_fl = input_attr->dfp.fl; - if (input_fl > 0) - { - inputScale = 1.0f / (float) ((int64_t)1 << input_fl); - } - else - { - inputScale = (float)((int64_t)1 << -input_fl); - } - } - else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputScale = input_attr->asymm.scale; - input_ZP = input_attr->asymm.zero_point; } if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { output_fl = output_attr->dfp.fl; - if (output_fl > 0) - { - outputScale = 1.0f / (float) ((int64_t)1 << output_fl); - } - else - { - outputScale = (float)((int64_t)1 << -output_fl); - } - } - else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outputScale = output_attr->asymm.scale; - output_ZP = output_attr->asymm.zero_point; } factorOut = 1.0f / outputScale; diff --git a/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c b/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c index 6bc113f..936590c 100644 --- a/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c @@ -147,8 +147,6 @@ DEF_KERNEL_INITIALIZER(_upsamplescale_initializer) float scaleOut = 1.0f; int32_t output_ZP = 0; int32_t input_ZP = 0; - int32_t srcFixPointPos = 0; - int32_t dstFixPointPos = 0; uint32_t pack_key = 0; _internal_upscale_e flag = UP_ORG; @@ -164,34 +162,10 @@ DEF_KERNEL_INITIALIZER(_upsamplescale_initializer) vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_VALUE], &(scale)); input_dtype = input_attr->dtype; output_dtype = output_attr->dtype; - - if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) - { - srcFixPointPos = input_attr->dfp.fl; - if (srcFixPointPos >=0 ) - scaleIn = 1.0f / (float) ((int64_t)1 << srcFixPointPos); - else - scaleIn = (float) ((int64_t)1 << -srcFixPointPos); - } - else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant) - { - input_ZP = input_attr->asymm.zero_point; - scaleIn = input_attr->asymm.scale; - } - - if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) - { - dstFixPointPos = output_attr->dfp.fl; - if (dstFixPointPos >=0 ) - scaleOut = 1.0f / (float) ((int64_t)1 << dstFixPointPos); - else - scaleOut = (float) ((int64_t)1 << -dstFixPointPos); - } - else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) - { - output_ZP = output_attr->asymm.zero_point; - scaleOut = output_attr->asymm.scale; - } + input_ZP = input_attr->zero_point; + scaleIn = input_attr->scale; + output_ZP = output_attr->zero_point; + scaleOut = output_attr->scale; if (stride == 2 && 
scale >= 0) { diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c index 547254f..8fefcb9 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c @@ -564,6 +564,11 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape temp_shape_y[temp_rank] = temp_shape_y[i]; temp_shape_output[temp_rank++] = temp_shape_output[i]; } + //Delete 1to1 dim + if (temp_rank != 1 && temp_shape_output[temp_rank - 1] == 1) + { + temp_rank --; + } } else if (temp_shape_x[i] != 1) { @@ -578,8 +583,12 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape sy *= temp_shape_y[j]; sz *= temp_shape_output[j]; } - temp_rank += tile_fill_dim( temp_shape_x, temp_shape_y, temp_shape_output, - temp_rank, VSI_NN_MAX_DIM_NUM, sx, sy, sz ); + //Delete 1to1 dim + if (sz != 1) + { + temp_rank += tile_fill_dim( temp_shape_x, temp_shape_y, temp_shape_output, + temp_rank, VSI_NN_MAX_DIM_NUM, sx, sy, sz ); + } idx_start = -1; } temp_shape_x[temp_rank] = temp_shape_x[i]; @@ -601,10 +610,6 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape * Skip dim if the size is equal to 1 * Also skip if ( sx == 1 && sy == 1 ) */ - if ( temp_shape_output[i] == 1 ) - { - continue; - } // Update state state = TILE_STATE_EMPTY; diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c index b9f3ff2..8dcae3c 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c @@ -250,6 +250,11 @@ static float inverse_sigmoid_eval(float x, vsi_nn_kernel_lut_params *lut_param) return log_eval(x1 / x2); } +static float tan_eval(float x) +{ + return tanf(x); +} + static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *lut_param) { float result = 0; @@ -325,6 +330,9 @@ static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params * case VSI_NN_KERNEL_LUT_INVERSE_SIGMOID: result = inverse_sigmoid_eval(data, lut_param); break; + case VSI_NN_KERNEL_LUT_TAN: + result = tan_eval(data); + break; default: VSILOGE( "unsupported activation function:%d", lut_param->act_type ); break; diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index 974ad58..b837e66 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -132,7 +132,6 @@ static vsi_status _select } \ REGISTER_KERNEL_SELECTOR( kernel_name, _##kernel_name##_kernel_selector ) -REGISTER_VX_FIRST_KERNEL_SELECTOR(exp) REGISTER_VX_FIRST_KERNEL_SELECTOR(log) REGISTER_VX_FIRST_KERNEL_SELECTOR(selu) REGISTER_VX_FIRST_KERNEL_SELECTOR(neg) @@ -153,6 +152,7 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(atan) REGISTER_VX_FIRST_KERNEL_SELECTOR(atanh) REGISTER_VX_FIRST_KERNEL_SELECTOR(acosh) REGISTER_VX_FIRST_KERNEL_SELECTOR(inverse_sigmoid) +REGISTER_VX_FIRST_KERNEL_SELECTOR(tan) #if (VX_TENSOR_SELECT_VX_SUPPORT) REGISTER_VX_FIRST_KERNEL_SELECTOR(select) #endif @@ -168,5 +168,19 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(relational_ops) #if (VX_TENSOR_TILE_API_SUPPORT) REGISTER_VX_FIRST_KERNEL_SELECTOR(tile) #endif +#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) +REGISTER_VX_FIRST_KERNEL_SELECTOR(layer_norm) +#endif +#if (VX_ACTIVATION_EXP_VX_SUPPORT_EXT) +REGISTER_VX_FIRST_KERNEL_SELECTOR(exp) +#endif +#if (VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT) 
+REGISTER_VX_FIRST_KERNEL_SELECTOR(sin) +REGISTER_VX_FIRST_KERNEL_SELECTOR(cos) +#endif +#if (VX_LOGSOFTMAX_VX_SUPPORT) +REGISTER_VX_FIRST_KERNEL_SELECTOR(log_softmax) +#endif + __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c index 55a6100..d74d6a1 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c @@ -811,7 +811,7 @@ vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias attr.dim_num = 2; } bias_data = (int32_t *)vsi_nn_ConvertTensorToData(graph, bias); - CHECK_PTR_FAIL_GOTO( new_bias_data_ptr, "Create buffer fail.", final ); + CHECK_PTR_FAIL_GOTO( bias_data, "ConvertTensorToData fail.", final ); } new_bias_data_ptr = (int32_t *)malloc(attr.size[0] * sizeof(int32_t)); @@ -869,3 +869,66 @@ vsi_status vsi_nn_set_sp_kernel_name return status; } +vsi_nn_tensor_t * vsi_nn_kernel_insert_reshape_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * in_tensor, + vsi_size_t * shape, + uint32_t dim_num, + vsi_nn_opt_direction_e direction + ) +{ + vsi_nn_tensor_t * tensor = NULL; +#if VX_REMOVE_RESHAPE_SUPPORT + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t *dims_tensor = NULL; + vx_nn_reshape_params_t reshape_param; + int32_t dims_data[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t i = 0; + vsi_nn_tensor_t * input = NULL; + vsi_nn_tensor_t * output = NULL; + + memcpy( &attr, &(in_tensor->attr), sizeof(vsi_nn_tensor_attr_t) ); + memcpy( attr.size, shape, sizeof(vsi_size_t) * dim_num); + attr.dim_num = dim_num; + tensor = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO(tensor, "Create tensor failed", final); + + for (i = 0; i < dim_num; i++) + { + dims_data[i] = (int32_t)shape[i]; + } + + memset(&attr, 0, sizeof(attr)); + attr.size[0] = dim_num; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + dims_tensor = vsi_nn_CreateTensorFromData( + graph, (uint8_t *)dims_data, &attr); + CHECK_PTR_FAIL_GOTO(dims_tensor, "Create tensor failed", final); + reshape_param.dims = REQUIRED_IO(dims_tensor); + + if (direction == VSI_NN_OPTIMIZE_BACKWARD) + { + input = in_tensor; + output = tensor; + } + else + { + input = tensor; + output = in_tensor; + } + + vxTensorReshapeNode(graph->g, input->t, &reshape_param, sizeof(reshape_param), output->t); + vsi_safe_release_tensor(dims_tensor); +#else + VSI_UNREFERENCED(direction); + tensor = vsi_nn_reshape_tensor( graph, in_tensor, shape, dim_num ); + CHECK_PTR_FAIL_GOTO(tensor, "Reshape tensor failed", final); +#endif + +final: + return tensor; +} diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c index 09514d3..1b53660 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -154,13 +154,14 @@ final: REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( mish, VSI_NN_KERNEL_LUT_MISH ) -//REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( exp, VSI_NN_KERNEL_LUT_EXP ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( log, VSI_NN_KERNEL_LUT_LOG ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( selu, VSI_NN_KERNEL_LUT_SELU ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( neg, VSI_NN_KERNEL_LUT_NEG ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( hard_sigmoid, VSI_NN_KERNEL_LUT_HSIGMOID ) +#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( gelu, 
VSI_NN_KERNEL_LUT_GELU ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( hard_gelu, VSI_NN_KERNEL_LUT_HGELU ) +#endif REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( erf, VSI_NN_KERNEL_LUT_ERF ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( relu_keras, VSI_NN_KERNEL_LUT_RELU_KERAS ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( clip, VSI_NN_KERNEL_LUT_CLIP ) @@ -168,6 +169,7 @@ REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( celu, VSI_NN_KERNEL_LUT_CELU ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( rcp, VSI_NN_KERNEL_LUT_RCP ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( softsign, VSI_NN_KERNEL_LUT_SOFTSIGN ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( atan, VSI_NN_KERNEL_LUT_ATAN ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( tan, VSI_NN_KERNEL_LUT_TAN ) #undef REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL @@ -412,4 +414,115 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( softrelu ) return (vsi_nn_kernel_node_t)node; } /* softrelu() */ +#if (VX_ACTIVATION_EXP_VX_SUPPORT_EXT) +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( exp ) +{ + vx_node node = NULL; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_EXP, + 0, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* exp() */ +#endif + +#if (VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT) +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( sin ) +{ + vx_node node = NULL; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SIN, + 0, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* sin() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( cos ) +{ + vx_node node = NULL; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_COS, + 0, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* cos() */ +#endif + +#if (VX_ACTIVATION_GELU_VX_SUPPORT_EXT) +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( gelu ) +{ + vx_node node = NULL; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_GELU, + 1, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* gelu() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( hard_gelu ) +{ + vx_node node = NULL; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HGELU, + 1, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* hard_gelu() */ +#endif + #undef REGISTER_ELTWISE_UNARY_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/layer_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/layer_norm_vx.c new file mode 100644 index 0000000..00a2def --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/layer_norm_vx.c @@ -0,0 +1,87 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in 
the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) +#define REGISTER_LAYER_NORM_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_LAYER_NORM_OPENVX_KERNEL( layer_norm ) +{ + vx_node node = NULL; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + + vx_tensor inputs_tensor[3] = {NULL}; + vx_tensor output_tensor = NULL; + + inputs_tensor[0] = inputs[0]->t; + inputs_tensor[1] = inputs[1]->t; + inputs_tensor[2] = inputs[2]->t; + output_tensor = outputs[0]->t; + + node = vxLayerNormalizationLayer( + graph->g, + eps, + axis, + inputs_tensor, + (uint32_t)input_num, + output_tensor + ); + + return (vsi_nn_kernel_node_t)node; +} /* layer_norm() */ + +#undef REGISTER_LAYER_NORM_OPENVX_KERNEL +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/vx/log_softmax_vx.c b/src/tim/vx/internal/src/kernel/vx/log_softmax_vx.c new file mode 100644 index 0000000..5b5f447 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/log_softmax_vx.c @@ -0,0 +1,85 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be 
included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_LOGSOFTMAX_VX_SUPPORT) +#define REGISTER_LOGSOFTMAX_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_LOGSOFTMAX_OPENVX_KERNEL( log_softmax ) +{ + vx_node node = NULL; + float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + vx_tensor input_tensor = NULL; + vx_tensor output_tensor = NULL; + + input_tensor = inputs[0]->t; + output_tensor = outputs[0]->t; + + node = vxLogSoftMaxLayer( + graph->g, + input_tensor, + beta, + axis, + output_tensor + ); + + return (vsi_nn_kernel_node_t)node; +} /* logsoftmax() */ + +#undef REGISTER_LOGSOFTMAX_OPENVX_KERNEL +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/crop_and_resize_bilinear.cl b/src/tim/vx/internal/src/libnnext/ops/cl/crop_and_resize_bilinear.cl new file mode 100644 index 0000000..7f8b1bf --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/crop_and_resize_bilinear.cl @@ -0,0 +1,107 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + + +_viv_uniform float width_scale; +_viv_uniform float height_scale; +_viv_uniform int image_width; +_viv_uniform int image_height; + +#define CROP_AND_RESIZE_BILINEAR(name, read_type, dst_type, conv_type, write_type) \ +__kernel void crop_and_resize_bilinear_##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint ori_batchout, \ + float inOutScale, \ + float inOutTile, \ + float extrapolation_value \ +) \ +{ \ + int bb = get_global_id(2); \ + int y = get_global_id(1); \ + int x = get_global_id(0); \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int2 coord_box_ind = (int2)(bb, 0); \ + int b = read_imagei(box_ind, coord_box_ind).x; \ + float4 xy; \ + float in_x, in_y; \ + int d = 0; \ + \ + Image img_boxes = create_image_from_image2d(boxes, 2); \ + __global half* 
boxes_ptr = (__global half*)img_boxes.ptr; \ + xy = vload_half4(bb, boxes_ptr); \ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \ + in_y = xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale; \ + in_x = xy.y * convert_float(image_width - 1) + convert_float(x) * _width_scale; \ + float y_lerp = in_y - floor(in_y); \ + float x_lerp = in_x - floor(in_x); \ + float4 src0, src1, src2, src3; \ + for (d = 0; d < ori_depth; d++) \ + { \ + int4 coord = (int4)(floor(in_x), floor(in_y), d + b * ori_depth, 0); \ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \ + { \ + src0 = (float4)(extrapolation_value,0,0,0); \ + } \ + else \ + { \ + src0 = convert_float4(read_type(input, coord)); \ + } \ + coord.x = coord.x + 1; \ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \ + { \ + src1 = (float4)(extrapolation_value,0,0,0); \ + } \ + else \ + { \ + src1 = convert_float4(read_type(input, coord)); \ + } \ + coord.y = coord.y + 1; \ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \ + { \ + src3 = (float4)(extrapolation_value,0,0,0); \ + } \ + else \ + { \ + src3 = convert_float4(read_type(input, coord)); \ + } \ + coord.x = coord.x - 1; \ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \ + { \ + src2 = (float4)(extrapolation_value,0,0,0); \ + } \ + else \ + { \ + src2 = convert_float4(read_type(input, coord)); \ + } \ + float4 top = src0 + (src1 - src0) * x_lerp; \ + float4 bottom = src2 + (src3 - src2) * x_lerp; \ + float4 value = top + (bottom - top) * y_lerp; \ + value = value * inOutScale + inOutTile; \ + dst_type dst = conv_type(value); \ + coord_out.z = d + coord_out.z * ori_depth; \ + write_type(output, coord_out, dst); \ + } \ +} + +CROP_AND_RESIZE_BILINEAR(U32toU32,read_imageui, \ +uint4, convert_uint4, write_imageui) +CROP_AND_RESIZE_BILINEAR(U32toF32,read_imageui, \ +float4,convert_float4,write_imagef) +CROP_AND_RESIZE_BILINEAR(F32toF32,read_imagef, \ +float4, convert_float4,write_imagef) +CROP_AND_RESIZE_BILINEAR(F32toU32,read_imagef, \ +uint4, convert_uint4, write_imageui) +CROP_AND_RESIZE_BILINEAR(F32toI32,read_imagef, \ +int4, convert_int4, write_imagei) +CROP_AND_RESIZE_BILINEAR(I32toI32,read_imagei, \ +int4, convert_int4, write_imagei) +CROP_AND_RESIZE_BILINEAR(I32toF32,read_imagei, \ +float4,convert_float4,write_imagef) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/crop_and_resize_nearest_neighbor.cl b/src/tim/vx/internal/src/libnnext/ops/cl/crop_and_resize_nearest_neighbor.cl new file mode 100644 index 0000000..e1f93c5 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/crop_and_resize_nearest_neighbor.cl @@ -0,0 +1,77 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + + +_viv_uniform float width_scale; +_viv_uniform float height_scale; +_viv_uniform int image_width; +_viv_uniform int image_height; + +#define CROP_AND_RESIZE_NEAREST_NEIGHTBOR(name,src_type, read_type, dst_type, conv_type, write_type) \ +__kernel void crop_and_resize_nearest_neighbor_##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint 
ori_depth, \ + uint ori_batchout, \ + float inOutScale, \ + float inOutTile, \ + float extrapolation_value \ +) \ +{ \ + int bb = get_global_id(2); \ + int y = get_global_id(1); \ + int x = get_global_id(0); \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int2 coord_box_ind = (int2)(bb, 0); \ + int b = read_imagei(box_ind, coord_box_ind).x; \ + float4 xy; \ + int in_x, in_y, d = 0; \ + \ + Image img_boxes = create_image_from_image2d(boxes, 2); \ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \ + xy = vload_half4(bb, boxes_ptr); \ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) \ + + convert_float(y) * _height_scale)); \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) \ + + convert_float(x) * _width_scale)); \ + for (d = 0; d < ori_depth; d++) \ + { \ + int4 coord = (int4)(in_x, in_y, d + b * ori_depth, 0); \ + float4 src_f; \ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \ + { \ + src_f = (float4)(extrapolation_value, 0, 0, 0); \ + } \ + else \ + { \ + src_type src = read_type(input, coord); \ + src_f = convert_float4(src); \ + } \ + src_f = src_f * inOutScale + inOutTile; \ + dst_type dst = conv_type(src_f); \ + coord_out.z = d + coord_out.z * ori_depth; \ + write_type(output, coord_out, dst); \ + } \ +} + +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(U32toU32,uint4, \ +read_imageui, uint4, convert_uint4, write_imageui) +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(U32toF32,uint4, \ +read_imageui, float4,convert_float4,write_imagef) +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(F32toF32,float4, \ +read_imagef, float4,convert_float4,write_imagef) +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(F32toU32,float4, \ +read_imagef, uint4, convert_uint4, write_imageui) +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(F32toI32,float4, \ +read_imagef, int4, convert_int4, write_imagei) +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(I32toI32,int4, \ +read_imagei, int4, convert_int4, write_imagei) +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(I32toF32,int4, \ +read_imagei, float4,convert_float4,write_imagef) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/detect_post_box.cl b/src/tim/vx/internal/src/libnnext/ops/cl/detect_post_box.cl deleted file mode 100644 index 66a7fcb..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/cl/detect_post_box.cl +++ /dev/null @@ -1,101 +0,0 @@ -float exp_(float x, float logE) -{ - x *= logE; - x = exp2(x); - return x; -} - -__kernel void detect_post_box_F32_F32toF32( - __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only image2d_array_t output, - float inv_scale_y, - float inv_scale_x, - float inv_scale_h, - float inv_scale_w, - float logE) -{ - int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); - float4 src0; - float4 src1; - float4 dst; - float4 tmp0, tmp1; - src0.x = read_imagef(input0, coord).x; - src1.x = read_imagef(input1, coord.xy).x; - coord.x++; - src0.y = read_imagef(input0, coord).x; - src1.y = read_imagef(input1, coord.xy).x; - coord.x++; - src0.z = read_imagef(input0, coord).x; - src1.z = read_imagef(input1, coord.xy).x; - coord.x++; - src0.w = read_imagef(input0, coord).x; - src1.w = read_imagef(input1, coord.xy).x; - - tmp0.x = src1.x + src1.z * src0.x * inv_scale_y; - tmp0.y = src1.y 
+ src1.w * src0.y * inv_scale_x; - tmp1.x = src1.z * exp_(src0.z * inv_scale_h, logE) * 0.5f; - tmp1.y = src1.w * exp_(src0.w * inv_scale_w, logE) * 0.5f; - dst.xy = tmp0.xy - tmp1.xy; - dst.zw = tmp0.xy + tmp1.xy; - coord.x = 0; - write_imagef(output, coord, dst.xxxx); - coord.x++; - write_imagef(output, coord, dst.yyyy); - coord.x++; - write_imagef(output, coord, dst.zzzz); - coord.x++; - write_imagef(output, coord, dst.wwww); -} - - -__kernel void detect_post_box_U8_U8toF32( - __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only image2d_array_t output, - float inv_scale_y, - float inv_scale_x, - float inv_scale_h, - float inv_scale_w, - float logE, - float input0Tail, - float input1Tail, - float input0Scale, - float input1Scale) -{ - int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); - uint4 in0, in1; - float4 src0; - float4 src1; - float4 dst; - float4 tmp0, tmp1; - in0.x = read_imageui(input0, coord).x; - in1.x = read_imageui(input1, coord.xy).x; - coord.x++; - in0.y = read_imageui(input0, coord).x; - in1.y = read_imageui(input1, coord.xy).x; - coord.x++; - in0.z = read_imageui(input0, coord).x; - in1.z = read_imageui(input1, coord.xy).x; - coord.x++; - in0.w = read_imageui(input0, coord).x; - in1.w = read_imageui(input1, coord.xy).x; - - src0 = convert_float4(in0) * input0Scale + input0Tail; - src1 = convert_float4(in1) * input1Scale + input1Tail; - - tmp0.x = src1.x + src1.z * src0.x * inv_scale_y; - tmp0.y = src1.y + src1.w * src0.y * inv_scale_x; - tmp1.x = src1.z * exp_(src0.z * inv_scale_h, logE) * 0.5f; - tmp1.y = src1.w * exp_(src0.w * inv_scale_w, logE) * 0.5f; - dst.xy = tmp0.xy - tmp1.xy; - dst.zw = tmp0.xy + tmp1.xy; - coord.x = 0; - write_imagef(output, coord, dst.xxxx); - coord.x++; - write_imagef(output, coord, dst.yyyy); - coord.x++; - write_imagef(output, coord, dst.zzzz); - coord.x++; - write_imagef(output, coord, dst.wwww); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl index e836a48..d9cc57a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl @@ -176,6 +176,11 @@ float eltwise_unary_inverse_sigmoid(float x, float alpha, float beta) return log(x1 / x2); } +float eltwise_unary_tan(float x, float alpha, float beta) +{ + return native_tan(x); +} + #define ELTWISE_UNARY_F32_2D(func_name) \ __kernel void func_name##_F32toF32_2D \ @@ -218,6 +223,7 @@ ELTWISE_UNARY_F32_2D(atan) ELTWISE_UNARY_F32_2D(atanh) ELTWISE_UNARY_F32_2D(acosh) ELTWISE_UNARY_F32_2D(inverse_sigmoid) +ELTWISE_UNARY_F32_2D(tan) #define ELTWISE_UNARY_U8_2D(func_name) \ __kernel void func_name##_U8toU8_2D \ @@ -261,6 +267,7 @@ ELTWISE_UNARY_U8_2D(atan) ELTWISE_UNARY_U8_2D(atanh) ELTWISE_UNARY_U8_2D(acosh) ELTWISE_UNARY_U8_2D(inverse_sigmoid) +ELTWISE_UNARY_U8_2D(tan) #define ELTWISE_UNARY_U8toF32_2D(func_name) \ __kernel void func_name##_U8toF32_2D \ @@ -303,6 +310,7 @@ ELTWISE_UNARY_U8toF32_2D(atan) ELTWISE_UNARY_U8toF32_2D(atanh) ELTWISE_UNARY_U8toF32_2D(acosh) ELTWISE_UNARY_U8toF32_2D(inverse_sigmoid) +ELTWISE_UNARY_U8toF32_2D(tan) __kernel void neg_I32toI32_2D ( diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl index 2adf398..767e8c5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl @@ -175,6 
+175,11 @@ float eltwise_unary_inverse_sigmoid(float x, float alpha, float beta) return log(x1 / x2); } +float eltwise_unary_tan(float x, float alpha, float beta) +{ + return native_tan(x); +} + #define ELTWISE_UNARY_F32(func_name) \ __kernel void func_name##_F32toF32 \ ( \ @@ -216,6 +221,7 @@ ELTWISE_UNARY_F32(atan) ELTWISE_UNARY_F32(atanh) ELTWISE_UNARY_F32(acosh) ELTWISE_UNARY_F32(inverse_sigmoid) +ELTWISE_UNARY_F32(tan) #define ELTWISE_UNARY_U8(func_name) \ __kernel void func_name##_U8toU8 \ @@ -259,6 +265,7 @@ ELTWISE_UNARY_U8(atan) ELTWISE_UNARY_U8(atanh) ELTWISE_UNARY_U8(acosh) ELTWISE_UNARY_U8(inverse_sigmoid) +ELTWISE_UNARY_U8(tan) #define ELTWISE_UNARY_U8toF32(func_name) \ __kernel void func_name##_U8toF32 \ @@ -301,6 +308,7 @@ ELTWISE_UNARY_U8toF32(atan) ELTWISE_UNARY_U8toF32(atanh) ELTWISE_UNARY_U8toF32(acosh) ELTWISE_UNARY_U8toF32(inverse_sigmoid) +ELTWISE_UNARY_U8toF32(tan) __kernel void neg_I32toI32 ( diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_z_h.cl b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_z_h.cl index dd2e562..8215ee7 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_z_h.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_z_h.cl @@ -31,7 +31,8 @@ __kernel void grucell_activation_z_h_U8_F32toU8_##act_name( \ __read_only image2d_t hstate_h_conv, \ __write_only image2d_t output, \ __write_only image2d_t hstate_out, \ - float input_scale, float input_tail, float output_scale, float output_zp) \ + float input_scale, float input_tail, float output_scale, float output_zp, \ + float output_scale1, float output_zp1) \ { \ int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ float4 src0, src1, src2, src3; \ @@ -48,10 +49,12 @@ __kernel void grucell_activation_z_h_U8_F32toU8_##act_name( \ z.x = act_func(z.x); \ h = tanh_func(h.x); \ float4 dst = (1 - z ) * h + z * h_tm; \ - dst = dst * output_scale + output_zp; \ - uint4 result = convert_uint4_sat_rte(dst); \ + float4 out0 = dst * output_scale + output_zp; \ + float4 out1 = dst * output_scale1 + output_zp1; \ + uint4 result = convert_uint4_sat_rte(out0); \ + uint4 result1 = convert_uint4_sat_rte(out1); \ write_imageui(output, coord_in.xy, result); \ - write_imageui(hstate_out, coord_in.xy, result); \ + write_imageui(hstate_out, coord_in.xy, result1); \ } GRUCELL_ACTIVATION_U8_F32_U8(SIGMOID, sigmoid) //GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) @@ -65,7 +68,8 @@ __kernel void grucell_activation_z_h_F32_F32toF32_##act_name( \ __read_only image2d_t hstate_h_conv, \ __write_only image2d_t output, \ __write_only image2d_t hstate_out, \ - float input_scale, float input_tail, float output_scale, float output_zp) \ + float input_scale, float input_tail, float output_scale, float output_zp, \ + float output_scale1, float output_zp1) \ { \ int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ float4 src0, src1, src2, src3; \ @@ -97,7 +101,8 @@ __kernel void grucell_activation_z_h_I32_F32toI32_##act_name( \ __read_only image2d_t hstate_h_conv, \ __write_only image2d_t output, \ __write_only image2d_t hstate_out, \ - float input_scale, float input_tail, float output_scale, float output_zp) \ + float input_scale, float input_tail, float output_scale, float output_zp, \ + float output_scale1, float output_zp1) \ { \ int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ float4 src0, src1, src2, src3; \ @@ -114,10 +119,12 @@ __kernel void grucell_activation_z_h_I32_F32toI32_##act_name( \ z.x = act_func(z.x); \ h = 
tanh_func(h.x); \ float4 dst = (1 - z ) * h + z * h_tm; \ - dst = dst * output_scale + output_zp; \ - int4 result = convert_int4_sat_rte(dst); \ + float4 out0 = dst * output_scale + output_zp; \ + float4 out1 = dst * output_scale1 + output_zp1; \ + int4 result = convert_int4_sat_rte(out0); \ + int4 result1 = convert_int4_sat_rte(out1); \ write_imagei(output, coord_in.xy, result); \ - write_imagei(hstate_out, coord_in.xy, result); \ + write_imagei(hstate_out, coord_in.xy, result1); \ } GRUCELL_ACTIVATION_I32_F32_I32(SIGMOID, sigmoid) -//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) \ No newline at end of file +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/grucell_reset_after_activation.cl b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_reset_after_activation.cl index a47b32d..45a6c23 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/grucell_reset_after_activation.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_reset_after_activation.cl @@ -21,6 +21,12 @@ float tanh_func(float x) return 2 * x - 1; } +float relu_func(float x) +{ + x = x > 0 ? x : 0; + return x; +} + #define GRUCELL_ACTIVATION_U8_F32_U8(act_name, act_func) \ __kernel void grucell_reset_after_activation_U8_F32toU8_##act_name( \ @@ -62,6 +68,7 @@ __kernel void grucell_reset_after_activation_U8_F32toU8_##act_name( \ } GRUCELL_ACTIVATION_U8_F32_U8(SIGMOID, sigmoid) //GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) +GRUCELL_ACTIVATION_U8_F32_U8(RELU, relu_func) #define GRUCELL_ACTIVATION_F32_F32_F32(act_name, act_func) \ __kernel void grucell_reset_after_activation_F32_F32toF32_##act_name( \ @@ -101,6 +108,7 @@ __kernel void grucell_reset_after_activation_F32_F32toF32_##act_name( \ GRUCELL_ACTIVATION_F32_F32_F32(SIGMOID, sigmoid) //GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) +GRUCELL_ACTIVATION_F32_F32_F32(RELU, relu_func) #define GRUCELL_ACTIVATION_I32_F32_I32(act_name, act_func) \ __kernel void grucell_reset_after_activation_I32_F32toI32_##act_name( \ @@ -141,4 +149,5 @@ __kernel void grucell_reset_after_activation_I32_F32toI32_##act_name( \ write_imagei(hstate_out, coord_in.xy, result); \ } GRUCELL_ACTIVATION_I32_F32_I32(SIGMOID, sigmoid) -//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) \ No newline at end of file +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) +GRUCELL_ACTIVATION_I32_F32_I32(RELU, relu_func) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_exceed_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_exceed_axis0.cl new file mode 100644 index 0000000..e9b8d76 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_exceed_axis0.cl @@ -0,0 +1,167 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; + + +#define rlogE (0.693147182f) +float LOG(float x) +{ + x = log2(x); + return x * rlogE; +} + +__kernel void log_softmax_exceed_axis0_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, float beta, + float scale, float scaleOut, float zpOut) +{ + int x = get_global_id(0); + int z = get_global_id(1); + int4 coord_in = (int4)(0, 0, z, 0); + float4 maxValue; + float4 src, dst = {0.0}; + + maxValue = read_imagef(input, coord_in); + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + src = read_imagef(input, coord_in); + 
maxValue = maxValue > src ? maxValue : src; + } + } + + // Compute sum. + float sum = 0.f; + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + src = read_imagef(input, coord_in); + sum += exp2((src.x - maxValue.x) * scale); + } + } + + // Compute result. + float logSum = LOG(sum); + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + src = read_imagef(input, coord_in); + dst.x = (src.x - maxValue.x) * beta - logSum; + write_imagef(output, coord_in, dst); + } + } +} + +__kernel void log_softmax_exceed_axis0_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, float beta, + float scale, float scaleOut, float zpOut) +{ + int x = get_global_id(0); + int z = get_global_id(1); + int4 coord_in = (int4)(0, 0, z, 0); + float4 maxValue; + float4 src; + uint4 dst = {0}; + + maxValue = convert_float4(read_imageui(input, coord_in)); + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + src = convert_float4(read_imageui(input, coord_in)); + maxValue = maxValue > src ? maxValue : src; + } + } + + // Compute sum. + float sum = 0.f; + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + src = convert_float4(read_imageui(input, coord_in)); + sum += exp2((src.x - maxValue.x) * scale); + } + } + + // Compute result. + float logSum = LOG(sum); + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + src = convert_float4(read_imageui(input, coord_in)); + + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut); + + write_imageui(output, coord_in, dst); + } + } +} + + +__kernel void log_softmax_exceed_axis0_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, float beta, + float scale, float scaleOut, float zpOut) +{ + int x = get_global_id(0); + int z = get_global_id(1); + int4 coord_in = (int4)(0, 0, z, 0); + float4 maxValue, src, dst = {0.0}; + uint4 data, val, out; + + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, maxValue, data, 16); + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + maxValue = maxValue > src ? 
maxValue : src; + } + } + + float sum = 0.f; + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + sum += exp2((src.x - maxValue.x) * scale); + } + } + + float logSum = LOG(sum); + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + dst.x = (src.x - maxValue.x) * beta - logSum; + _viv_asm(COPY, val, dst, 16); + out = val >> 16; + write_imageui(output, coord_in, out); + } + } +} +#undef rlogE diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_exceed_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_exceed_axis1.cl new file mode 100644 index 0000000..f6d0afc --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_exceed_axis1.cl @@ -0,0 +1,172 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + +_viv_uniform int height; +_viv_uniform int depth; + +#define rlogE (0.693147182f) + +float LOG(float x) +{ + x = log2(x); + return x * rlogE; +} + +__kernel void log_softmax_exceed_axis1_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, float beta, + float scale, float scaleOut, float zpOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int4 coord_in = (int4)(x, 0, 0, 0); + float4 maxValue; + float4 src, dst = {0.0}; + + maxValue = read_imagef(input, coord_in); + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + src = read_imagef(input, coord_in); + maxValue = maxValue > src ? maxValue : src; + } + } + + // Compute sum. + float sum = 0.f; + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + src = read_imagef(input, coord_in); + sum += exp2((src.x - maxValue.x) * scale); + } + } + + // Compute result. + float logSum = LOG(sum); + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + src = read_imagef(input, coord_in); + + dst.x = (src.x - maxValue.x) * beta - logSum; + write_imagef(output, coord_in, dst); + } + } +} + +__kernel void log_softmax_exceed_axis1_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, float beta, + float scale, float scaleOut, float zpOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int4 coord_in = (int4)(x, 0, 0, 0); + float4 maxValue; + float4 src; + uint4 dst = {0}; + + maxValue = convert_float4(read_imageui(input, coord_in)); + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + src = convert_float4(read_imageui(input, coord_in)); + + maxValue = maxValue > src ? maxValue : src; + } + } + + // Compute sum. + float sum = 0.f; + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + src = convert_float4(read_imageui(input, coord_in)); + + sum += exp2((src.x - maxValue.x) * scale); + } + } + + // Compute result. 
+ float logSum = LOG(sum); + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + src = convert_float4(read_imageui(input, coord_in)); + + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut); + + write_imageui(output, coord_in, dst); + } + } +} + +__kernel void log_softmax_exceed_axis1_BF16oBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, float beta, + float scale, float scaleOut, float zpOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int4 coord_in = (int4)(x, 0, 0, 0); + float4 maxValue, src, dst = {0.0}; + uint4 data, val, out; + + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, maxValue, data, 16); + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + maxValue = maxValue > src ? maxValue : src; + } + } + + float sum = 0.f; + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + sum += exp2((src.x - maxValue.x) * scale); + } + } + + float logSum = LOG(sum); + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + dst.x = (src.x - maxValue.x) * beta - logSum; + + _viv_asm(COPY, val, dst, 16); + out = val >> 16; + + write_imageui(output, coord_in, out); + } + } +} + +#undef rlogE diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_4x.cl b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_4x.cl index e4cc547..4de7918 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_4x.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_4x.cl @@ -123,5 +123,133 @@ __kernel void gemm_4x_transa_F32F32toF32_2D( } +__kernel __attribute__((reqd_work_group_size(1, 64, 1))) + void gemm_4x_transa_local_F32F32toF32_2D( + __read_only image2d_t inputA, + __read_only image2d_t inputB, + __write_only image2d_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out + ) +{ + int offset0 = get_global_id(0); + int lid = get_local_id(1); + int stride = 0; + + int z = 0; + int offset1 = M << 2; + int step = K >> 8; + int lid2 = lid * 4 * step; + + Image in0_tensor = create_image_from_image2d(inputA, 4); + __global float* in0_ptr0 = (__global float*)in0_tensor.ptr + offset0 + lid2 * M; + __global float* in0_ptr1 = in0_ptr0 + M; + __global float* in0_ptr2 = in0_ptr1 + M; + __global float* in0_ptr3 = in0_ptr2 + M; + + Image in1_tensor = create_image_from_image2d(inputB, 4); + __global float* in1_ptr = (__global float*)in1_tensor.ptr + lid2; + + Image o_tensor = create_image_from_image2d(output, 4); + __global float* output_ptr = (__global float*)o_tensor.ptr + offset0; + + __local float4 sum_vec4_0[64]; + __local float4 sum_vec4_1[64]; + __local float4 sum_vec4_2[64]; + __local float4 sum_vec4_3[64]; + + float4 sum0 = (float4)(0.0, 0.0, 0.0, 0.0); + float4 sum1 = (float4)(0.0, 0.0, 0.0, 0.0); + float4 sum2 = (float4)(0.0, 0.0, 0.0, 0.0); + float4 sum3 = (float4)(0.0, 0.0, 0.0, 0.0); + + float4 tempA0, tempA1, tempA2, 
tempA3; + float4 tempA4, tempA5, tempA6, tempA7; + float4 tempB0; + + for(z = 0; z < step; z++) + { + tempB0 = vload4(z, in1_ptr); + tempA0 = vload4(0, in0_ptr0); + tempA1 = vload4(0, in0_ptr1); + tempA2 = vload4(0, in0_ptr2); + tempA3 = vload4(0, in0_ptr3); + tempA4 = vload4(1, in0_ptr0); + tempA5 = vload4(1, in0_ptr1); + tempA6 = vload4(1, in0_ptr2); + tempA7 = vload4(1, in0_ptr3); + + sum0 = sum0 + tempA0 * tempB0.x; + sum0 = sum0 + tempA1 * tempB0.y; + sum0 = sum0 + tempA2 * tempB0.z; + sum0 = sum0 + tempA3 * tempB0.w; + sum1 = sum1 + tempA4 * tempB0.x; + sum1 = sum1 + tempA5 * tempB0.y; + sum1 = sum1 + tempA6 * tempB0.z; + sum1 = sum1 + tempA7 * tempB0.w; + + tempA0 = vload4(2, in0_ptr0); + tempA1 = vload4(2, in0_ptr1); + tempA2 = vload4(2, in0_ptr2); + tempA3 = vload4(2, in0_ptr3); + tempA4 = vload4(3, in0_ptr0); + tempA5 = vload4(3, in0_ptr1); + tempA6 = vload4(3, in0_ptr2); + tempA7 = vload4(3, in0_ptr3); + + in0_ptr0 = in0_ptr0 + offset1; + in0_ptr1 = in0_ptr1 + offset1; + in0_ptr2 = in0_ptr2 + offset1; + in0_ptr3 = in0_ptr3 + offset1; + + sum2 = sum2 + tempA0 * tempB0.x; + sum2 = sum2 + tempA1 * tempB0.y; + sum2 = sum2 + tempA2 * tempB0.z; + sum2 = sum2 + tempA3 * tempB0.w; + sum3 = sum3 + tempA4 * tempB0.x; + sum3 = sum3 + tempA5 * tempB0.y; + sum3 = sum3 + tempA6 * tempB0.z; + sum3 = sum3 + tempA7 * tempB0.w; + } + sum_vec4_0[lid] = sum0; + sum_vec4_1[lid] = sum1; + sum_vec4_2[lid] = sum2; + sum_vec4_3[lid] = sum3; + + barrier(CLK_LOCAL_MEM_FENCE); + + for (stride = 32; stride > 0; stride >>= 1) + { + if (lid < stride) + { + sum_vec4_0[lid] += sum_vec4_0[lid + stride]; + sum_vec4_1[lid] += sum_vec4_1[lid + stride]; + sum_vec4_2[lid] += sum_vec4_2[lid + stride]; + sum_vec4_3[lid] += sum_vec4_3[lid + stride]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (lid == 0) + { + sum0 = sum_vec4_0[0]; + sum1 = sum_vec4_1[0]; + sum2 = sum_vec4_2[0]; + sum3 = sum_vec4_3[0]; + vstore4(sum0, 0, output_ptr); + vstore4(sum1, 1, output_ptr); + vstore4(sum2, 2, output_ptr); + vstore4(sum3, 3, output_ptr); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/resize_cubic.cl b/src/tim/vx/internal/src/libnnext/ops/cl/resize_cubic.cl new file mode 100644 index 0000000..7cdce1c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/resize_cubic.cl @@ -0,0 +1,195 @@ +__kernel void resize_cubic_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float half_pixel_value + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float cubic_coeffs_y[4] = {0,0,0,0}; + float cubic_coeffs_x[4] = {0,0,0,0}; + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value; + float left_x_f = floor(in_x); + float4 delta_x = (float4)(0, in_x - left_x_f,0,0); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value; + float top_y_f = floor(in_y); + float4 delta_y = (float4)(0, in_y - top_y_f,0,0); + int x_idx = convert_int(left_x_f - 1); + int y_idx = convert_int(top_y_f - 1); + int4 coord_in = (int4)(x_idx, y_idx, coord_out.z, 0); + float data00, data01, data02, data03, data10, data11, data12, data13, + data20, data21, data22, data23, data30, data31, data32, data33; + + delta_x.x = 1 + delta_x.y; + delta_x.z = 1 - delta_x.y; + delta_x.w = 2 - delta_x.y; + cubic_coeffs_x[0] = -0.5 * ((((delta_x.x - 5) * delta_x.x + 8) * delta_x.x) - 4); + cubic_coeffs_x[1] = (1.5 * delta_x.y - 2.5) * delta_x.y * delta_x.y + 1; + cubic_coeffs_x[2] = (1.5 * 
delta_x.z - 2.5) * delta_x.z * delta_x.z + 1; + cubic_coeffs_x[3] = -0.5 * ((((delta_x.w - 5) * delta_x.w + 8) * delta_x.w) - 4); + delta_y.x = 1 + delta_y.y; + delta_y.z = 1 - delta_y.y; + delta_y.w = 2 - delta_y.y; + cubic_coeffs_y[0] = -0.5 * ((((delta_y.x - 5) * delta_y.x + 8) * delta_y.x) - 4); + cubic_coeffs_y[1] = (1.5 * delta_y.y - 2.5) * delta_y.y * delta_y.y + 1; + cubic_coeffs_y[2] = (1.5 * delta_y.z - 2.5) * delta_y.z * delta_y.z + 1; + cubic_coeffs_y[3] = -0.5 * ((((delta_y.w - 5) * delta_y.w + 8) * delta_y.w) - 4); + float4 dst = (float4)(0,0,0,0); + + data00 = read_imagef(input, coord_in).x; + coord_in.x++; + data10 = read_imagef(input, coord_in).x; + coord_in.x++; + data20 = read_imagef(input, coord_in).x; + coord_in.x++; + data30 = read_imagef(input, coord_in).x; + + coord_in.y++; + data31 = read_imagef(input, coord_in).x; + coord_in.x--; + data21 = read_imagef(input, coord_in).x; + coord_in.x--; + data11 = read_imagef(input, coord_in).x; + coord_in.x--; + data01 = read_imagef(input, coord_in).x; + + coord_in.y++; + data02 = read_imagef(input, coord_in).x; + coord_in.x++; + data12 = read_imagef(input, coord_in).x; + coord_in.x++; + data22 = read_imagef(input, coord_in).x; + coord_in.x++; + data32 = read_imagef(input, coord_in).x; + + coord_in.y++; + data33 = read_imagef(input, coord_in).x; + coord_in.x--; + data23 = read_imagef(input, coord_in).x; + coord_in.x--; + data13 = read_imagef(input, coord_in).x; + coord_in.x--; + data03 = read_imagef(input, coord_in).x; + + dst.x = data00 * cubic_coeffs_x[0] * cubic_coeffs_y[0] + + data01 * cubic_coeffs_x[0] * cubic_coeffs_y[1] + + data02 * cubic_coeffs_x[0] * cubic_coeffs_y[2] + + data03 * cubic_coeffs_x[0] * cubic_coeffs_y[3] + + data10 * cubic_coeffs_x[1] * cubic_coeffs_y[0] + + data11 * cubic_coeffs_x[1] * cubic_coeffs_y[1] + + data12 * cubic_coeffs_x[1] * cubic_coeffs_y[2] + + data13 * cubic_coeffs_x[1] * cubic_coeffs_y[3] + + data20 * cubic_coeffs_x[2] * cubic_coeffs_y[0] + + data21 * cubic_coeffs_x[2] * cubic_coeffs_y[1] + + data22 * cubic_coeffs_x[2] * cubic_coeffs_y[2] + + data23 * cubic_coeffs_x[2] * cubic_coeffs_y[3] + + data30 * cubic_coeffs_x[3] * cubic_coeffs_y[0] + + data31 * cubic_coeffs_x[3] * cubic_coeffs_y[1] + + data32 * cubic_coeffs_x[3] * cubic_coeffs_y[2] + + data33 * cubic_coeffs_x[3] * cubic_coeffs_y[3]; + + write_imagef(output, coord_out, dst); + +} + + +__kernel void resize_cubic_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float half_pixel_value, + float in_scale, + float in_tail, + float out_scale, + float out_tail + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float cubic_coeffs_y[4] = {0,0,0,0}; + float cubic_coeffs_x[4] = {0,0,0,0}; + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value; + float left_x_f = floor(in_x); + float4 delta_x = (float4)(0, in_x - left_x_f,0,0); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value; + float top_y_f = floor(in_y); + float4 delta_y = (float4)(0, in_y - top_y_f,0,0); + int x_idx = convert_int(left_x_f - 1); + int y_idx = convert_int(top_y_f - 1); + int4 coord_in = (int4)(x_idx, y_idx, coord_out.z, 0); + float data00, data01, data02, data03, data10, data11, data12, data13, + data20, data21, data22, data23, data30, data31, data32, data33; + + delta_x.x = 1 + delta_x.y; + delta_x.z = 1 - delta_x.y; + delta_x.w = 2 - delta_x.y; + cubic_coeffs_x[0] = -0.5 * ((((delta_x.x 
- 5) * delta_x.x + 8) * delta_x.x) - 4); + cubic_coeffs_x[1] = (1.5 * delta_x.y - 2.5) * delta_x.y * delta_x.y + 1; + cubic_coeffs_x[2] = (1.5 * delta_x.z - 2.5) * delta_x.z * delta_x.z + 1; + cubic_coeffs_x[3] = -0.5 * ((((delta_x.w - 5) * delta_x.w + 8) * delta_x.w) - 4); + delta_y.x = 1 + delta_y.y; + delta_y.z = 1 - delta_y.y; + delta_y.w = 2 - delta_y.y; + cubic_coeffs_y[0] = -0.5 * ((((delta_y.x - 5) * delta_y.x + 8) * delta_y.x) - 4); + cubic_coeffs_y[1] = (1.5 * delta_y.y - 2.5) * delta_y.y * delta_y.y + 1; + cubic_coeffs_y[2] = (1.5 * delta_y.z - 2.5) * delta_y.z * delta_y.z + 1; + cubic_coeffs_y[3] = -0.5 * ((((delta_y.w - 5) * delta_y.w + 8) * delta_y.w) - 4); + float dst = 0; + uint4 out = (uint4)(0,0,0,0); + + data00 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x++; + data10 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x++; + data20 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x++; + data30 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + + coord_in.y++; + data31 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x--; + data21 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x--; + data11 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x--; + data01 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + + coord_in.y++; + data02 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x++; + data12 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x++; + data22 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x++; + data32 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + + coord_in.y++; + data33 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x--; + data23 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x--; + data13 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x--; + data03 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + + dst = data00 * cubic_coeffs_x[0] * cubic_coeffs_y[0] + + data01 * cubic_coeffs_x[0] * cubic_coeffs_y[1] + + data02 * cubic_coeffs_x[0] * cubic_coeffs_y[2] + + data03 * cubic_coeffs_x[0] * cubic_coeffs_y[3] + + data10 * cubic_coeffs_x[1] * cubic_coeffs_y[0] + + data11 * cubic_coeffs_x[1] * cubic_coeffs_y[1] + + data12 * cubic_coeffs_x[1] * cubic_coeffs_y[2] + + data13 * cubic_coeffs_x[1] * cubic_coeffs_y[3] + + data20 * cubic_coeffs_x[2] * cubic_coeffs_y[0] + + data21 * cubic_coeffs_x[2] * cubic_coeffs_y[1] + + data22 * cubic_coeffs_x[2] * cubic_coeffs_y[2] + + data23 * cubic_coeffs_x[2] * cubic_coeffs_y[3] + + data30 * cubic_coeffs_x[3] * cubic_coeffs_y[0] + + data31 * cubic_coeffs_x[3] * cubic_coeffs_y[1] + + data32 * cubic_coeffs_x[3] * cubic_coeffs_y[2] + + data33 * cubic_coeffs_x[3] * cubic_coeffs_y[3]; + out.x = convert_uint(dst * out_scale + out_tail); + + write_imageui(output, coord_out, out); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update_reduction.cl b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update_reduction.cl new file mode 100644 index 0000000..ff57204 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update_reduction.cl @@ -0,0 +1,203 @@ + +inline void AtomicAdd_float(volatile __global float *source, 
const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = prevVal.floatVal + operand; + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +inline void AtomicMul_float(volatile __global float *source, const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = prevVal.floatVal * operand; + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +inline void AtomicMax_float(volatile __global float *source, const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = fmax(prevVal.floatVal, operand); + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +inline void AtomicMin_float(volatile __global float *source, const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = fmin(prevVal.floatVal, operand); + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +#define SCATTER_REDUCTION_PREPROCESS(name0, ptr0, type0, size0, ptr2) \ +__kernel void scatter_nd_update_reduction_preprocess_##name0( \ + __read_only image2d_t input_ref, \ + image2d_t temp_buf_float, \ + int length, int res, float input_scale, float zp_scale) \ +{ \ + int gidx = get_global_id(0); \ + Image img1 = create_image_from_image2d(input_ref, size0); \ + Image img2 = create_image_from_image2d(temp_buf_float, 4); \ + __global float* tmp_ref_ptr = (__global float*)img2.ptr; \ + type0 src0, src1; \ + float4 tmpDst0, tmpDst1; \ + __global ptr2* input_ptr = (__global ptr2*)img1.ptr; \ + if(length > 0) \ + { \ + int loc2 = gidx * 8; \ + ptr0 tmpData0 = vload4(0, input_ptr + loc2); \ + ptr0 tmpData1 = vload4(1, input_ptr + loc2); \ + _viv_asm(COPY, src0, tmpData0, 16); \ + _viv_asm(COPY, src1, tmpData1, 16); \ + _viv_asm(CONV, tmpDst0, src0); \ + _viv_asm(CONV, tmpDst1, src1); \ + tmpDst0 = tmpDst0 * input_scale + zp_scale; \ + tmpDst1 = tmpDst1 * input_scale + zp_scale; \ + vstore4(tmpDst0, 0, tmp_ref_ptr + loc2); \ + vstore4(tmpDst1, 1, tmp_ref_ptr + loc2); \ + } \ + for(int i = gidx; i < res; i += get_global_size(0)) \ + { \ + ptr2 tmpData0 = input_ptr[length + i]; \ + _viv_asm(COPY, src0, tmpData0, 4); \ + _viv_asm(CONV, tmpDst0, src0); \ + tmpDst0.x = tmpDst0.x * input_scale + zp_scale; \ + tmp_ref_ptr[length + i] = tmpDst0.x; \ + } \ +} +SCATTER_REDUCTION_PREPROCESS(U8, uchar4, uchar4, 1, uchar) +SCATTER_REDUCTION_PREPROCESS(I8, char4, char4, 1, char) +SCATTER_REDUCTION_PREPROCESS(I16, short4, short4, 2, short) +SCATTER_REDUCTION_PREPROCESS(F16, short4, half4, 2, short) +SCATTER_REDUCTION_PREPROCESS(F32, float4, float4, 4, float) + +#define SCATTER_ND_REDUCTION_PROCESS_F16(name0, func) \ +__kernel void scatter_nd_update_reduction_##name0##_F16( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + image2d_t temp_buf_float, \ + 
image2d_t link_buffer0, \ + int val0, int val1, int val2, int val3, int val4, int val5, int val6, \ + int coord_dim, int update_width, int output_width, float update_scale, float zp_scale) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(update, 2); \ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global short* update_ptr = (__global short*)img2.ptr; \ + __global float* output_ptr = (__global float*)img3.ptr; \ + half src; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + short tmpData = update_ptr[gidy * update_width + gidx]; \ + int idx = indice.x * val0 + indice.y * val1 + indice.z * val2 + indice.w * val3; \ + idx = idx + indice1.x * val4 + indice1.y * val5 + indice1.z * val6; \ + int loc = idx * output_width + gidx; \ + _viv_asm(COPY, src, tmpData, 4); \ + float data; \ + _viv_asm(CONV, data, src); \ + func(output_ptr + loc, data); \ +} +SCATTER_ND_REDUCTION_PROCESS_F16(Add, AtomicAdd_float) +SCATTER_ND_REDUCTION_PROCESS_F16(Mul, AtomicMul_float) +SCATTER_ND_REDUCTION_PROCESS_F16(Max, AtomicMax_float) +SCATTER_ND_REDUCTION_PROCESS_F16(Min, AtomicMin_float) + +#define SCATTER_ND_UPDATE_PROCESS_QINT(name0, src0_type, ptr_type, element_size, func) \ +__kernel void scatter_nd_update_reduction_##name0##_##src0_type( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + image2d_t temp_buf_float, \ + image2d_t link_buffer0, \ + int val0, int val1, int val2, int val3, int val4, int val5, int val6, \ + int coord_dim, int update_width, int output_width, float update_scale, float zp_scale) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(update, element_size); \ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \ + __global float* output_ptr = (__global float*)img3.ptr; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + ptr_type tmpData = update_ptr[gidy * update_width + gidx]; \ + int idx = indice.x * val0 + indice.y * val1 + indice.z * val2 + indice.w * val3; \ + idx = idx + indice1.x * val4 + indice1.y * val5 + indice1.z * val6; \ + int loc = idx * output_width + gidx; \ + float data; \ + _viv_asm(CONV, data, tmpData); \ + data = data * update_scale + zp_scale; \ + func(output_ptr + loc, data); \ +} +SCATTER_ND_UPDATE_PROCESS_QINT(Add, U8, uchar, 1, AtomicAdd_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, U8, uchar, 1, AtomicMul_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Max, U8, uchar, 1, AtomicMax_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Min, U8, uchar, 1, AtomicMin_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Add, I8, char, 1, AtomicAdd_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, I8, char, 1, AtomicMul_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Max, I8, char, 1, AtomicMax_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Min, I8, char, 1, AtomicMin_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Add, I16, short, 2, AtomicAdd_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, I16, short, 2, AtomicMul_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Max, I16, short, 2, AtomicMax_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Min, I16, short, 2, AtomicMin_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Add, F32, float, 4, AtomicAdd_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, F32, float, 4, AtomicMul_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Max, F32, float, 4, AtomicMax_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Min, F32, float, 4, AtomicMin_float) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update_reduction_conv.cl b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update_reduction_conv.cl new file mode 100644 index 0000000..80e07f7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update_reduction_conv.cl @@ -0,0 +1,72 @@ +__kernel void scatter_nd_update_reduction_conv_F16( + __read_only image2d_t temp_buf_float, + __read_only image2d_t link_buf, + image2d_t output, + int length, int res, float output_scale, float output_zp) +{ + int gidx = get_global_id(0); + Image img1 = create_image_from_image2d(temp_buf_float, 4); + Image img2 = create_image_from_image2d(output, 2); + __global float* input_ptr = (__global float*)img1.ptr; + __global short* output_ptr = (__global short*)img2.ptr; + if(length > 0) + { + int offset = gidx * 8; + float4 src0 = vload4(0, input_ptr + offset); + float4 src1 = vload4(1, input_ptr + offset); + half4 data0, data1; + _viv_asm(CONV, data0, src0); + _viv_asm(CONV, data1, src1); + short4 dst0, dst1; + _viv_asm(COPY, dst0, data0, 16); + _viv_asm(COPY, dst1, data1, 16); + vstore4(dst0, 0, output_ptr + offset); + vstore4(dst1, 1, output_ptr + offset); + } + for(int i = gidx; i < res; i += get_global_size(0)) + { + float src = input_ptr[length + i]; + half data; + _viv_asm(CONV, data, src); + short dst; + _viv_asm(COPY, dst, data, 4); + output_ptr[length + i] = dst; + } +} + +#define SCATTER_ND_UPDATE_CONV(src0_type, ptr_type, element_size, ptr_type1, conv_func) \ +__kernel void scatter_nd_update_reduction_conv_##src0_type( \ + __read_only image2d_t temp_buf_float, \ + __read_only image2d_t link_buf, \ + image2d_t output, \ + int length, int res, float output_scale, float output_zp) \ +{ \ + int gidx = get_global_id(0); \ + Image img1 = create_image_from_image2d(temp_buf_float, 4); \ + Image img2 = create_image_from_image2d(output, element_size); \ + __global float* input_ptr = (__global float*)img1.ptr; \ + __global ptr_type1* 
output_ptr = (__global ptr_type1*)img2.ptr; \ + if(length > 0) \ + { \ + int offset = gidx * 8; \ + float4 src0 = vload4(0, input_ptr + offset); \ + float4 src1 = vload4(1, input_ptr + offset); \ + int4 data0 = convert_int4_rte(src0 * output_scale + output_zp); \ + int4 data1 = convert_int4_rte(src1 * output_scale + output_zp); \ + ptr_type dst0, dst1; \ + _viv_asm(CONV, dst0, data0); \ + _viv_asm(CONV, dst1, data1); \ + vstore4(dst0, 0, output_ptr + offset); \ + vstore4(dst1, 1, output_ptr + offset); \ + } \ + for(int i = gidx; i < res; i += get_global_size(0)) \ + { \ + float src = input_ptr[length + i]; \ + int data = convert_int_rte(src * output_scale + output_zp); \ + output_ptr[length + i] = conv_func(data); \ + } \ +} +SCATTER_ND_UPDATE_CONV(U8, uchar4, 1, uchar, convert_uchar) +SCATTER_ND_UPDATE_CONV(I8, char4, 1, char, convert_char) +SCATTER_ND_UPDATE_CONV(I16, short4, 2, short, convert_short) +SCATTER_ND_UPDATE_CONV(F32, float4, 4, float, convert_float) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/swish.cl b/src/tim/vx/internal/src/libnnext/ops/cl/swish.cl index 95254d2..d457a36 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/swish.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/swish.cl @@ -121,7 +121,7 @@ __kernel void swish_I32toI32_2D( src = read_imagef(input, coord); \ tmp.x = sigmoid_(src.x * beta, logE); \ data.x = src.x * tmp.x; \ - uint4 dst = convert_uint4(data * outputScale + outputZP); \ + uint4 dst = convert_uint4_rte(data * outputScale + outputZP); \ write_imageui(output, coord, dst); __kernel void swish_F32toU8( diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/crop_and_resize_bilinear.vx b/src/tim/vx/internal/src/libnnext/ops/vx/crop_and_resize_bilinear.vx new file mode 100644 index 0000000..5e126ec --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/crop_and_resize_bilinear.vx @@ -0,0 +1,255 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#include "cl_viv_vx_ext.h" + +_viv_uniform float inOutScale; +_viv_uniform float inOutTile; +_viv_uniform float width_scale; +_viv_uniform float height_scale; +_viv_uniform int image_width; +_viv_uniform int image_height; +_viv_uniform VXC_512Bits uniRightToFp32_4x4; +_viv_uniform VXC_512Bits uniLeftToFp32_4x4; +_viv_uniform VXC_512Bits uniExtract8Bit_2x8; +_viv_uniform VXC_512Bits uniExtractHalf8_2x8; + +#define CROP_AND_RESIZE_PART0 \ + int i = 0; \ + int bb = get_global_id(2); \ + int y = get_global_id(1); \ + int4 x = (int4)(get_global_id(0),get_global_id(0) + 1, get_global_id(0) + 2, get_global_id(0) + 3); \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int2 coord_box_ind = (int2)(bb, 0); \ + int b = read_imagei(box_ind, coord_box_ind).x; \ + float4 xy, in_x; \ + float in_y; \ + float4 x_lerp, y_lerp; \ + int d = 0; \ + Image img_boxes = create_image_from_image2d(boxes, 2); \ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \ + xy = vload_half4(bb, boxes_ptr); \ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \ + \ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \ + in_y = xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale; \ + y_lerp.x = in_y - floor(in_y); \ + y_lerp.yzw = y_lerp.xxx; + +#define CROP_AND_RESIZE_PART1 \ + int4 coord = (int4)(0, in_y, d + b * ori_depth, 0); \ + int8 input_desc, output_desc; \ + \ + coord_out.z = d + coord_out.z * ori_depth; \ + \ + 
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.w, baseAddr); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + in_x.x = xy.y * convert_float(image_width - 1); \ + in_x.yzw = in_x.xxx; \ + in_x = in_x + convert_float4(x) * _width_scale; \ + x_lerp = in_x - floor(in_x); \ + coord.x = floor(in_x.x); \ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord.x = floor(in_x.y); \ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord.x = floor(in_x.z); \ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord.x = floor(in_x.w); \ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + +#define CROP_AND_RESIZE_BILINEAR_Quant8toQuant8(name,src_type,dst_type) \ +__kernel void crop_and_resize_bilinear_##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint ori_batchout \ +) \ +{ \ + CROP_AND_RESIZE_PART0; \ + for (d = 0; d < ori_depth; d++) \ + { \ + src_type src0, src1; \ + CROP_AND_RESIZE_PART1; \ + \ + float4 top, bottom, value; \ + float4 top_left4,top_right4,bottom_left4,bottom_right4; \ + dst_type data; \ + int4 tmpout; \ + \ + VXC_DP4x4(top_left4, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \ + VXC_DP4x4(top_right4, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \ + VXC_DP4x4(bottom_left4, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \ + VXC_DP4x4(bottom_right4, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \ + \ + top = top_left4 + (top_right4 - top_left4) * x_lerp; \ + bottom = bottom_left4 + (bottom_right4 - bottom_left4) * x_lerp; \ + value = top + (bottom - top) * y_lerp; \ + value = value * inOutScale + inOutTile; \ + _viv_asm(CONV, tmpout, value); \ + VXC_DP2x8(data, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CROP_AND_RESIZE_BILINEAR_Quant8toQuant8(U8toU8,vxc_uchar8, vxc_uchar4) +CROP_AND_RESIZE_BILINEAR_Quant8toQuant8(I8toI8,vxc_char8, vxc_char4) + +#define CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(name,src_type,dst_type,tmp_type) \ +__kernel void crop_and_resize_bilinear_##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint 
ori_batchout \ +) \ +{ \ + CROP_AND_RESIZE_PART0; \ + for (d = 0; d < ori_depth; d++) \ + { \ + vxc_short8 src0, src1; \ + src_type src0_temp, src1_temp; \ + CROP_AND_RESIZE_PART1; \ + \ + _viv_asm(COPY, src0_temp, src0, 16); \ + _viv_asm(COPY, src1_temp, src1, 16); \ + float4 top, bottom, value; \ + float4 top_left4,top_right4,bottom_left4,bottom_right4; \ + dst_type data; \ + vxc_short4 out; \ + tmp_type tmpout; \ + \ + VXC_DP4x4(top_left4, src0_temp, src0_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \ + VXC_DP4x4(top_right4, src0_temp, src0_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \ + VXC_DP4x4(bottom_left4, src1_temp, src1_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \ + VXC_DP4x4(bottom_right4, src1_temp, src1_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \ + \ + top = top_left4 + (top_right4 - top_left4) * x_lerp; \ + bottom = bottom_left4 + (bottom_right4 - bottom_left4) * x_lerp; \ + value = top + (bottom - top) * y_lerp; \ + value = value * inOutScale + inOutTile; \ + _viv_asm(CONV, tmpout, value); \ + VXC_DP2x8(data, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \ + _viv_asm(COPY, out, data, 8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(I16toI16, vxc_short8, vxc_short4, short4) +CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(I16toF16, vxc_short8, vxc_half4, half4) +CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(F16toF16, vxc_half8, vxc_half4, half4) +CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(F16toI16, vxc_half8, vxc_short4, short4) + +#define CROP_AND_RESIZE_BILINEAR_F16toQuant8(name,dst_type) \ +__kernel void crop_and_resize_bilinear_F16to##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint ori_batchout \ +) \ +{ \ + CROP_AND_RESIZE_PART0; \ + for (d = 0; d < ori_depth; d++) \ + { \ + vxc_short8 src0, src1; \ + vxc_half8 src0_temp, src1_temp; \ + CROP_AND_RESIZE_PART1; \ + \ + _viv_asm(COPY, src0_temp, src0, 16); \ + _viv_asm(COPY, src1_temp, src1, 16); \ + float4 top, bottom, value; \ + float4 top_left4,top_right4,bottom_left4,bottom_right4; \ + dst_type data; \ + int4 tmpout; \ + \ + VXC_DP4x4(top_left4, src0_temp, src0_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \ + VXC_DP4x4(top_right4, src0_temp, src0_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \ + VXC_DP4x4(bottom_left4, src1_temp, src1_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \ + VXC_DP4x4(bottom_right4, src1_temp, src1_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \ + \ + top = top_left4 + (top_right4 - top_left4) * x_lerp; \ + bottom = bottom_left4 + (bottom_right4 - bottom_left4) * x_lerp; \ + value = top + (bottom - top) * y_lerp; \ + value = value * inOutScale + inOutTile; \ + _viv_asm(CONV, tmpout, value); \ + VXC_DP2x8(data, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, data, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CROP_AND_RESIZE_BILINEAR_F16toQuant8(U8, vxc_uchar4) +CROP_AND_RESIZE_BILINEAR_F16toQuant8(I8, vxc_char4) + +#define CROP_AND_RESIZE_BILINEAR_Quant8toF16(name,src_type) \ 
+__kernel void crop_and_resize_bilinear_##name##toF16 \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint ori_batchout \ +) \ +{ \ + CROP_AND_RESIZE_PART0; \ + for (d = 0; d < ori_depth; d++) \ + { \ + src_type src0, src1; \ + CROP_AND_RESIZE_PART1; \ + \ + float4 top, bottom, value; \ + float4 top_left4,top_right4,bottom_left4,bottom_right4; \ + vxc_half4 data; \ + vxc_short4 out; \ + half4 tmpout; \ + \ + VXC_DP4x4(top_left4, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \ + VXC_DP4x4(top_right4, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \ + VXC_DP4x4(bottom_left4, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \ + VXC_DP4x4(bottom_right4, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \ + \ + top = top_left4 + (top_right4 - top_left4) * x_lerp; \ + bottom = bottom_left4 + (bottom_right4 - bottom_left4) * x_lerp; \ + value = top + (bottom - top) * y_lerp; \ + value = value * inOutScale + inOutTile; \ + _viv_asm(CONV, tmpout, value); \ + VXC_DP2x8(data, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \ + _viv_asm(COPY, out, data, 8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CROP_AND_RESIZE_BILINEAR_Quant8toF16(U8, vxc_uchar8) +CROP_AND_RESIZE_BILINEAR_Quant8toF16(I8, vxc_char8) + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/crop_and_resize_nearest_neighbor.vx b/src/tim/vx/internal/src/libnnext/ops/vx/crop_and_resize_nearest_neighbor.vx new file mode 100644 index 0000000..b67890f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/crop_and_resize_nearest_neighbor.vx @@ -0,0 +1,292 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#include "cl_viv_vx_ext.h" + +_viv_uniform float inOutScale; +_viv_uniform float inOutTile; +_viv_uniform float width_scale; +_viv_uniform float height_scale; +_viv_uniform int image_width; +_viv_uniform int image_height; +_viv_uniform VXC_512Bits uniConvertFstToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertSecToFp32_4x4; +_viv_uniform VXC_512Bits uniExtract8Bit_2x8; +_viv_uniform VXC_512Bits uniExtractHalf8_2x8; + +#define IMG_LOAD(src_type) \ + src_type src; \ + int4 coord = (int4)(0, in_y, d + b * ori_depth, 0); \ + int8 input_desc, output_desc; \ + \ + coord_out.z = d + coord_out.z * ori_depth; \ + \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.w, baseAddr); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x) * _width_scale)); \ + coord.x = in_x; \ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 1) * _width_scale)); \ + coord.x = in_x; \ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 2) * _width_scale)); \ + coord.x = in_x; \ + 
VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 3) * _width_scale)); \ + coord.x = in_x; \ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 4) * _width_scale)); \ + coord.x = in_x; \ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 5) * _width_scale)); \ + coord.x = in_x; \ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 6) * _width_scale)); \ + coord.x = in_x; \ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 7) * _width_scale)); \ + coord.x = in_x; \ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \ + +#define CROP_AND_RESIZE_Quant8toQuant8(name, data_type) \ +__kernel void crop_and_resize_nearest_neighbor_##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint ori_batchout \ +) \ +{ \ + int bb = get_global_id(2); \ + int y = get_global_id(1); \ + int x = get_global_id(0); \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int2 coord_box_ind = (int2)(bb, 0); \ + int b = read_imagei(box_ind, coord_box_ind).x; \ + float4 xy; \ + int in_x, in_y; \ + int d = 0; \ + Image img_boxes = create_image_from_image2d(boxes, 2); \ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \ + xy = vload_half4(bb, boxes_ptr); \ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \ + \ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale)); \ + \ + for (d = 0; d < ori_depth; d++) \ + { \ + data_type data; \ + int4 tmpout0, tmpout1; \ + float4 tmpdata0, tmpdata1; \ + IMG_LOAD(data_type); \ + \ + VXC_DP4x4(tmpdata0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \ + VXC_DP4x4(tmpdata1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \ + \ + tmpdata0 = tmpdata0 * inOutScale + inOutTile; \ + tmpdata1 = tmpdata1 * inOutScale + inOutTile; \ + _viv_asm(CONV, tmpout0, tmpdata0); \ + _viv_asm(CONV, tmpout1, tmpdata1); \ + \ + VXC_DP2x8(data, tmpout0, tmpout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CROP_AND_RESIZE_Quant8toQuant8(U8toU8, vxc_uchar8) +CROP_AND_RESIZE_Quant8toQuant8(I8toI8, vxc_char8) + +#define CROP_AND_RESIZE_Quant8toF16(name, src_type) \ +__kernel void crop_and_resize_nearest_neighbor_##name##toF16 \ +( \ + 
__read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint ori_batchout \ +) \ +{ \ + int bb = get_global_id(2); \ + int y = get_global_id(1); \ + int x = get_global_id(0); \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int2 coord_box_ind = (int2)(bb, 0); \ + int b = read_imagei(box_ind, coord_box_ind).x; \ + float4 xy; \ + int in_x, in_y; \ + int d = 0; \ + Image img_boxes = create_image_from_image2d(boxes, 2); \ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \ + xy = vload_half4(bb, boxes_ptr); \ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \ + \ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale)); \ + \ + for (d = 0; d < ori_depth; d++) \ + { \ + vxc_short8 out; \ + vxc_half8 data; \ + half4 tmpout0, tmpout1; \ + float4 tmpdata0, tmpdata1; \ + IMG_LOAD(src_type); \ + \ + VXC_DP4x4(tmpdata0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \ + VXC_DP4x4(tmpdata1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \ + \ + tmpdata0 = tmpdata0 * inOutScale + inOutTile; \ + tmpdata1 = tmpdata1 * inOutScale + inOutTile; \ + _viv_asm(CONV, tmpout0, tmpdata0); \ + _viv_asm(CONV, tmpout1, tmpdata1); \ + \ + VXC_DP2x8(data, tmpout0, tmpout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \ + _viv_asm(COPY, out, data, 16); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CROP_AND_RESIZE_Quant8toF16(U8, vxc_uchar8) +CROP_AND_RESIZE_Quant8toF16(I8, vxc_char8) + +#define CROP_AND_RESIZE_NEAREST_F16toQuant8(name, dst_type) \ +__kernel void crop_and_resize_nearest_neighbor_F16to##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint ori_batchout \ +) \ +{ \ + int bb = get_global_id(2); \ + int y = get_global_id(1); \ + int x = get_global_id(0); \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int2 coord_box_ind = (int2)(bb, 0); \ + int b = read_imagei(box_ind, coord_box_ind).x; \ + float4 xy; \ + int in_x, in_y; \ + int d = 0; \ + Image img_boxes = create_image_from_image2d(boxes, 2); \ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \ + xy = vload_half4(bb, boxes_ptr); \ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \ + \ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale)); \ + \ + for (d = 0; d < ori_depth; d++) \ + { \ + dst_type data; \ + int4 tmpout0, tmpout1; \ + float4 tmpdata0, tmpdata1; \ + IMG_LOAD(vxc_short8); \ + vxc_half8 src_half; \ + _viv_asm(COPY, src_half, src, 16); \ + \ + VXC_DP4x4(tmpdata0, src_half, src_half, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \ + VXC_DP4x4(tmpdata1, src_half, src_half, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvertSecToFp32_4x4); \ + \ + tmpdata0 = tmpdata0 * inOutScale + inOutTile; \ + tmpdata1 = tmpdata1 * inOutScale + inOutTile; \ + _viv_asm(CONV, tmpout0, tmpdata0); \ + _viv_asm(CONV, tmpout1, tmpdata1); \ + \ + VXC_DP2x8(data, tmpout0, tmpout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CROP_AND_RESIZE_NEAREST_F16toQuant8(U8, vxc_uchar8) +CROP_AND_RESIZE_NEAREST_F16toQuant8(I8, vxc_char8) + +#define CROP_AND_RESIZE_16Bitsto16Bits(name,src_type,dst_type,temp_type) \ +__kernel void crop_and_resize_nearest_neighbor_##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint ori_batchout \ +) \ +{ \ + int bb = get_global_id(2); \ + int y = get_global_id(1); \ + int x = get_global_id(0); \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int2 coord_box_ind = (int2)(bb, 0); \ + int b = read_imagei(box_ind, coord_box_ind).x; \ + float4 xy; \ + int in_x, in_y; \ + int d = 0; \ + Image img_boxes = create_image_from_image2d(boxes, 2); \ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \ + xy = vload_half4(bb, boxes_ptr); \ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \ + \ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale)); \ + \ + for (d = 0; d < ori_depth; d++) \ + { \ + vxc_short8 out; \ + dst_type data; \ + temp_type tmpout0, tmpout1; \ + float4 tmpdata0, tmpdata1; \ + IMG_LOAD(vxc_short8); \ + src_type src_temp; \ + _viv_asm(COPY, src_temp, src, 16); \ + \ + VXC_DP4x4(tmpdata0, src_temp, src_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \ + VXC_DP4x4(tmpdata1, src_temp, src_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \ + \ + _viv_asm(CONV, tmpout0, tmpdata0); \ + _viv_asm(CONV, tmpout1, tmpdata1); \ + \ + VXC_DP2x8(data, tmpout0, tmpout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \ + _viv_asm(COPY, out, data, 16); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CROP_AND_RESIZE_16Bitsto16Bits \ +(F16toF16, vxc_half8, vxc_half8, half4) +CROP_AND_RESIZE_16Bitsto16Bits \ +(F16toI16, vxc_half8, vxc_short8, short4) +CROP_AND_RESIZE_16Bitsto16Bits \ +(I16toF16, vxc_short8, vxc_half8, half4) +CROP_AND_RESIZE_16Bitsto16Bits \ +(I16toI16, vxc_short8, vxc_short8,short4) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx index 9a6a9fe..00d13ce 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx @@ -81,6 +81,11 @@ float4 eltwise_unary_acosh(float4 val) return acosh(val); } +float4 eltwise_unary_tan(float4 val) +{ + return native_tan(val); +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -198,4 +203,5 @@ ADD_ELTSISE_UNARY_2D(atan) ADD_ELTSISE_UNARY_2D(atanh) //ACOSH ADD_ELTSISE_UNARY_2D(acosh) - +//TAN +ADD_ELTSISE_UNARY_2D(tan) diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx index f53c3ff..f8de5fa 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx @@ -81,6 +81,11 @@ float4 eltwise_unary_acosh(float4 val) return acosh(val); } +float4 eltwise_unary_tan(float4 val) +{ + return native_tan(val); +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -197,3 +202,5 @@ ADD_ELTSISE_UNARY_3D(atan) ADD_ELTSISE_UNARY_3D(atanh) //ACOSH ADD_ELTSISE_UNARY_3D(acosh) +//TAN +ADD_ELTSISE_UNARY_3D(tan) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx index 73171a8..7ba4fc1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx @@ -9,7 +9,8 @@ __kernel void gather_I8toI8( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -34,7 +35,8 @@ __kernel void gather_U8toU8( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -59,7 +61,8 @@ __kernel void gather_I16toI16( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -85,7 +88,8 @@ __kernel void gather_F16toF16( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -110,7 +114,8 @@ __kernel void gather_I8toI8_axis0( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); @@ -137,7 +142,8 @@ __kernel void gather_U8toU8_axis0( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); @@ -164,7 +170,8 @@ __kernel void gather_I16toI16_axis0( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); @@ -191,7 +198,8 @@ __kernel void gather_F16toF16_axis0( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx index b2009bf..b7729d0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx @@ -11,7 +11,8 @@ __kernel void gather_I8toI8_array( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -25,13 +26,29 @@ __kernel void gather_I8toI8_array( Image img1 = create_image_from_image2d(input0, 1); Image img2 = create_image_from_image2d(output, 1); - uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); - __global vxc_char16* data_ptr = (__global vxc_char16*)input_ptr; - vxc_char16 src = data_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + + uchar* input_ptr = 
get_image_ptr_from_coord(img1, coord_in.zw); uchar* output_ptr = get_image_ptr_from_coord(img2, coord); - __global vxc_char16* dst_ptr = (__global vxc_char16*)output_ptr; - dst_ptr[0] = src; + + if (gidx == ((block_size >> 4) * 16)) + { + __global char* data_ptr = (__global char*)input_ptr; + __global char* dst_ptr = (__global char*)output_ptr; + int i = 0; + for (i = 0; i < block_size - gidx; i ++) + { + dst_ptr[i] = data_ptr[i]; + } + } + else + { + __global vxc_char16* data_ptr = (__global vxc_char16*)input_ptr; + vxc_char16 src = data_ptr[0]; + __global vxc_char16* dst_ptr = (__global vxc_char16*)output_ptr; + dst_ptr[0] = src; + } } __kernel void gather_U8toU8_array( @@ -40,7 +57,8 @@ __kernel void gather_U8toU8_array( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -54,13 +72,29 @@ __kernel void gather_U8toU8_array( Image img1 = create_image_from_image2d(input0, 1); Image img2 = create_image_from_image2d(output, 1); - uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); - __global vxc_uchar16* data_ptr = (__global vxc_uchar16*)input_ptr; - vxc_uchar16 src = data_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); uchar* output_ptr = get_image_ptr_from_coord(img2, coord); - __global vxc_uchar16* dst_ptr = (__global vxc_uchar16*)output_ptr; - dst_ptr[0] = src; + + if (gidx == ((block_size >> 4) * 16)) + { + __global uchar* data_ptr = (__global uchar*)input_ptr; + __global uchar* dst_ptr = (__global uchar*)output_ptr; + int i = 0; + for (i = 0; i < block_size - gidx; i ++) + { + dst_ptr[i] = data_ptr[i]; + } + } + else + { + __global vxc_uchar16* data_ptr = (__global vxc_uchar16*)input_ptr; + vxc_uchar16 src = data_ptr[0]; + __global vxc_uchar16* dst_ptr = (__global vxc_uchar16*)output_ptr; + dst_ptr[0] = src; + } } __kernel void gather_I16toI16_array( @@ -69,7 +103,8 @@ __kernel void gather_I16toI16_array( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -84,13 +119,29 @@ __kernel void gather_I16toI16_array( Image img1 = create_image_from_image2d(input0, 2); Image img2 = create_image_from_image2d(output, 2); - uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); - __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; - vxc_short8 src = data_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); uchar* output_ptr = get_image_ptr_from_coord(img2, coord); - __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr; - dst_ptr[0] = src; + + if (gidx == ((block_size >> 3) * 8)) + { + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + int i = 0; + for (i = 0; i < block_size - gidx; i ++) + { + dst_ptr[i] = data_ptr[i]; + } + } + else + { + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; + vxc_short8 src = data_ptr[0]; + __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr; + dst_ptr[0] = src; + } } __kernel void gather_F16toF16_array( @@ -99,7 +150,8 @@ __kernel void gather_F16toF16_array( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -114,13 +166,29 @@ __kernel void 
gather_F16toF16_array( Image img1 = create_image_from_image2d(input0, 2); Image img2 = create_image_from_image2d(output, 2); - uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); - __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; - vxc_short8 src = data_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); uchar* output_ptr = get_image_ptr_from_coord(img2, coord); - __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr; - dst_ptr[0] = src; + + if (gidx == ((block_size >> 3) * 8)) + { + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + int i = 0; + for (i = 0; i < block_size - gidx; i ++) + { + dst_ptr[i] = data_ptr[i]; + } + } + else + { + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; + vxc_short8 src = data_ptr[0]; + __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr; + dst_ptr[0] = src; + } } #define GATHER_AXIS0_ARRAY(src0_type_name, read_type, data_type, write_type) \ @@ -130,7 +198,8 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx index 47f1db6..4bc39f0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx @@ -10,7 +10,8 @@ __kernel void gather_batch_I8toI8( __write_only image2d_array_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -41,7 +42,8 @@ __kernel void gather_batch_U8toU8( __write_only image2d_array_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -72,7 +74,8 @@ __kernel void gather_batch_I16toI16( __write_only image2d_array_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -103,7 +106,8 @@ __kernel void gather_batch_F16toF16( __write_only image2d_array_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -134,7 +138,8 @@ __kernel void gather_batch_I8toI8_axis0( __write_only image2d_array_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); @@ -163,7 +168,8 @@ __kernel void gather_batch_U8toU8_axis0( __write_only image2d_array_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); @@ -192,7 +198,8 @@ __kernel void gather_batch_I16toI16_axis0( __write_only image2d_array_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); @@ -221,7 +228,8 @@ __kernel void gather_batch_F16toF16_axis0( __write_only image2d_array_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), 
get_global_id(2), 0); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx index 87825fd..bbe29e7 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx @@ -15,7 +15,8 @@ __kernel void gather_##src0_type_name##toF16( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int gidx = get_global_id(0); \ @@ -52,7 +53,8 @@ __kernel void gather_F16to##src1_type_name( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int gidx = get_global_id(0); \ @@ -85,7 +87,8 @@ __kernel void gather_I16toF16( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); @@ -120,7 +123,8 @@ __kernel void gather_##src0_type_name##toF16_axis0( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ @@ -152,7 +156,8 @@ __kernel void gather_F16to##src1_type_name##_axis0( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ @@ -184,7 +189,8 @@ __kernel void gather_I16toF16_axis0( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx index 988c811..e68d0a1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx @@ -16,7 +16,8 @@ __kernel void gather_batch_##src0_type_name##toF16( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int gidx = get_global_id(0); \ @@ -63,7 +64,8 @@ __kernel void gather_batch_F16to##src1_type_name( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int gidx = get_global_id(0); \ @@ -104,7 +106,8 @@ __kernel void gather_batch_I16toF16( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); @@ -143,7 +146,8 @@ __kernel void gather_batch_##src0_type_name##toF16_axis0( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ @@ -178,7 +182,8 @@ __kernel void gather_batch_F16to##src1_type_name##_axis0( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ @@ -213,7 +218,8 @@ __kernel void gather_batch_I16toF16_axis0( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx 
b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx index 6d117ed..00f6511 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx @@ -6,6 +6,9 @@ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniExtractOddData_2x8; +_viv_uniform float output_scale1; +_viv_uniform float output_zp1; + float4 sigmoid_func(float4 x) { x *= -logE; @@ -117,13 +120,15 @@ __kernel void grucell_activation_z_h_##name0##_F16to##name1##_##act_name( \ VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ h_tm = h_tm * hstate_in_scale + hstate_in_tail; \ float4 result = (1 - z) * h + z * h_tm; \ - result = result * output_scale + output_zp; \ - int4 dst0; \ - _viv_asm(CONV_RTE, dst0, result); \ + float4 out0 = result * output_scale + output_zp; \ + float4 out1 = result * output_scale1 + output_zp1; \ + int4 dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, out0); \ + _viv_asm(CONV_RTE, dst1, out1); \ dst_type dst; \ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(hstate_out, coord_in, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8) GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx index dbd265a..854bc1e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx @@ -25,6 +25,11 @@ float4 tanh_func(float4 x) x = 1.0f / x; return 2 * x - 1; } +float4 relu_func(float4 x) +{ + x = x > 0 ? 
x : 0; + return x; +} _viv_uniform VXC_512Bits uniF16PlusF16_0_4x4; _viv_uniform VXC_512Bits uniF16PlusF16_1_4x4; @@ -88,6 +93,8 @@ __kernel void grucell_reset_after_activation_F16_F16toF16_##act_name##_##rec_act } GRUCELL_F16_F16TOF16(TANH, tanh_func, SIGMOID, sigmoid_func) GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func) +GRUCELL_F16_F16TOF16(RELU, relu_func, SIGMOID, sigmoid_func) + _viv_uniform float hstate_in_scale; _viv_uniform float hstate_in_tail; @@ -153,6 +160,10 @@ GRUCELL_QNT_F16TO_QNT(I16_F16toI16_TANH_SIGMOID, tanh_func, sigmoid_func, GRUCELL_QNT_F16TO_QNT(U8_F16toU8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_uchar8, vxc_uchar8) GRUCELL_QNT_F16TO_QNT(I8_F16toI8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_char8, vxc_char8) GRUCELL_QNT_F16TO_QNT(I16_F16toI16_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_short8, vxc_short8) +GRUCELL_QNT_F16TO_QNT(U8_F16toU8_RELU_SIGMOID, relu_func, sigmoid_func, vxc_uchar8, vxc_uchar8) +GRUCELL_QNT_F16TO_QNT(I8_F16toI8_RELU_SIGMOID, relu_func, sigmoid_func, vxc_char8, vxc_char8) +GRUCELL_QNT_F16TO_QNT(I16_F16toI16_RELU_SIGMOID, relu_func, sigmoid_func, vxc_short8, vxc_short8) + #define GRUCELL_BF16(act_name, act_func, rec_act_name, rec_act_func) \ __kernel void grucell_reset_after_activation_BF16_BF16toBF16_##act_name##_##rec_act_name( \ @@ -215,3 +226,4 @@ __kernel void grucell_reset_after_activation_BF16_BF16toBF16_##act_name##_##rec_ } GRUCELL_BF16(TANH, tanh_func, SIGMOID, sigmoid_func) GRUCELL_BF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func) +GRUCELL_BF16(RELU, relu_func, SIGMOID, sigmoid_func) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_0.vx new file mode 100644 index 0000000..9046891 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_0.vx @@ -0,0 +1,315 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform int height; +_viv_uniform uint group_num; +_viv_uniform float output_zp; +_viv_uniform float output_scale; +_viv_uniform float inv_multiplier; + + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_U8_F16toF16( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + vxc_uchar16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= inv_multiplier; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + coord_para.z = 0; + coord_para.w = 0; + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); 
+ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1, norm; + half4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord_in, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; + coord_para.y = coord.y; coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + tmpData0 = tmpData0 - mean_vari.s0; + tmpData1 = tmpData1 - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniExtract8Data_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_U8_F32toF16( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + vxc_uchar16 src0; + vxc_short8 outval; + vxc_half8 scale_h, dst; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= inv_multiplier; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + coord_para.z = 0; + coord_para.w = 0; + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1, norm; + half4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord_in, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; + coord_bias.y = coord.y; + + bias_f0 = read_imagef(bias, coord_bias); + scale_f0 = read_imagef(scale, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + scale_f1 = read_imagef(scale, coord_bias); + coord_bias.x = coord.x; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + 
VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + tmpData0 = tmpData0 - mean_vari.s0; + tmpData1 = tmpData1 - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniExtract8Data_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_U8_F16toU8( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + vxc_uchar16 src0, outval; + vxc_short8 src1; + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= inv_multiplier; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + coord_para.z = 0; + coord_para.w = 0; + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1, norm; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord_in, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; + coord_para.y = coord.y; coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + tmpData0 = tmpData0 - mean_vari.s0; + tmpData1 = tmpData1 - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); + + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniExtract8Data_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + 
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_U8_F32toU8( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + vxc_uchar16 src0 , outval; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= inv_multiplier; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + coord_para.z = 0; + coord_para.w = 0; + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1, norm; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord_in, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; + coord_bias.y = coord.y; + + bias_f0 = read_imagef(bias, coord_bias); + scale_f0 = read_imagef(scale, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + scale_f1 = read_imagef(scale, coord_bias); + coord_bias.x = coord.x; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + tmpData0 = tmpData0 - mean_vari.s0; + tmpData1 = tmpData1 - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); + + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniExtract8Data_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_1.vx new file mode 100644 index 0000000..b247b2f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_1.vx @@ -0,0 +1,317 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform int height; +_viv_uniform uint group_num; +_viv_uniform float output_zp; +_viv_uniform float output_scale; +_viv_uniform float inv_multiplier; + + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I8_F16toF16( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = 
(int4)(get_global_id(0), 0, gidz, gidz); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + vxc_char16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= inv_multiplier; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + coord_para.z = 0; + coord_para.w = 0; + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1, norm; + half4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord_in, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; + coord_para.y = coord.y; coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + tmpData0 = tmpData0 - mean_vari.s0; + tmpData1 = tmpData1 - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniExtract8Data_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I8_F32toF16( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + vxc_char16 src0; + vxc_short8 outval; + vxc_half8 scale_h, dst; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= inv_multiplier; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + coord_para.z = 0; + coord_para.w = 0; + int4 coord_bias = coord_para; + 
+ int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1, norm; + half4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord_in, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; + coord_bias.y = coord.y; + + bias_f0 = read_imagef(bias, coord_bias); + scale_f0 = read_imagef(scale, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + scale_f1 = read_imagef(scale, coord_bias); + coord_bias.x = coord.x; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + tmpData0 = tmpData0 - mean_vari.s0; + tmpData1 = tmpData1 - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniExtract8Data_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I8_F16toU8( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + vxc_char16 src0; + vxc_uchar16 outval; + vxc_short8 src1; + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= inv_multiplier; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + coord_para.z = 0; + coord_para.w = 0; + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1, norm; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord_in, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; + coord_para.y = coord.y; coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + 
coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + tmpData0 = tmpData0 - mean_vari.s0; + tmpData1 = tmpData1 - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); + + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniExtract8Data_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I8_F32toU8( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + vxc_char16 src0; + vxc_uchar16 outval; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= inv_multiplier; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + coord_para.z = 0; + coord_para.w = 0; + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1, norm; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord_in, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; + coord_bias.y = coord.y; + + bias_f0 = read_imagef(bias, coord_bias); + scale_f0 = read_imagef(scale, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + scale_f1 = read_imagef(scale, coord_bias); + coord_bias.x = coord.x; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + tmpData0 = tmpData0 - mean_vari.s0; + tmpData1 = tmpData1 - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); + + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniExtract8Data_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file 
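The four layernorm_axis01_I8_* kernels above (and the U8 variants in layer_normalization_axis01_0.vx) share one computation: fold the per-group partial sums stored in meanVari into a mean and an inverse standard deviation, normalize each element, apply the per-channel scale and bias, and, for quantized outputs, requantize with output_scale/output_zp before the saturating VXC_DP2x8 pack. The scalar C sketch below is illustrative only and is not part of the patch; the function and parameter names (layernorm_axis01_ref, partial_sums, the element count n) are hypothetical stand-ins for the image-based inputs the kernels actually use.

#include <math.h>
#include <stddef.h>
#include <stdint.h>

/* Illustrative scalar model of the layernorm_axis01 math (not from the patch).
 * partial_sums holds {sum, sum_of_squares} pairs produced by an earlier pass
 * (the role played by the meanVari image); inv_multiplier is 1 / element_count. */
static void layernorm_axis01_ref(const float *x, size_t n,
                                 const float *partial_sums, size_t group_num,
                                 float inv_multiplier, float eps,
                                 const float *scale, const float *bias,
                                 float output_scale, float output_zp,
                                 uint8_t *out_u8)
{
    /* Stage 1: fold per-group partials into mean and 1/sqrt(var + eps). */
    float sum = 0.0f, sqr = 0.0f;
    for (size_t g = 0; g < group_num; ++g) {
        sum += partial_sums[2 * g + 0];
        sqr += partial_sums[2 * g + 1];
    }
    float mean    = sum * inv_multiplier;
    float var     = sqr * inv_multiplier - mean * mean + eps;
    float inv_std = 1.0f / sqrtf(var);           /* rsqrt(mean_vari.s1) in the kernel */

    /* Stage 2: normalize, apply scale/bias, requantize with saturation. */
    for (size_t i = 0; i < n; ++i) {
        float norm = scale[i] * inv_std * (x[i] - mean) + bias[i];
        long  q    = lrintf(norm * output_scale + output_zp); /* convert_int4_rte(...) */
        if (q < 0)   q = 0;                      /* saturating pack handled by the */
        if (q > 255) q = 255;                    /* VXC_DP2x8 extract modifier     */
        out_u8[i] = (uint8_t)q;
    }
}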
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_2.vx new file mode 100644 index 0000000..4d35266 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_2.vx @@ -0,0 +1,348 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform int height; +_viv_uniform uint group_num; +_viv_uniform float output_zp; +_viv_uniform float output_scale; +_viv_uniform float inv_multiplier; + +#define LAYER_NORM_AXIS01_F16_F16to16Bits(name,temp_type,dst_type,conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_F16_F16to##name( \ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \ + image2d_array_t output, float eps) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ + int2 coord_sum = (int2)(0, gidz); \ + int4 coord_para = coord; \ + vxc_short8 src0; \ + vxc_short8 src1; \ + vxc_half8 scale_h, in_h; \ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + vxc_float4 mean_vari = (vxc_float4)(0); \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_sum); \ + coord_sum.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + coord_para.z = 0; \ + coord_para.w = 0; \ + int4 coord_bias = coord_para; \ + \ + int8 input_desc, scale_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ + \ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); \ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; \ + _viv_asm(MOV, coord_para.w, baseAddr_c); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr); \ + \ + vxc_float4 tmpData0, tmpData1; \ + vxc_short8 outval; \ + temp_type tmpVal0, tmpVal1; \ + dst_type dst; \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.y ++; \ + coord_para.y = coord.y; \ + coord_bias.y = coord.y; \ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x = coord.x; \ + \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + \ + vxc_float4 sub, norm; \ + sub = tmpData0 - mean_vari.s0; \ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; \ + norm 
= norm * output_scale + output_zp; \ + _viv_asm(conv_type, tmpVal0, norm); \ + sub = tmpData1 - mean_vari.s0; \ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(conv_type, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_AXIS01_F16_F16to16Bits(F16,half4,vxc_half8,CONV) +LAYER_NORM_AXIS01_F16_F16to16Bits(I16,int4,vxc_short8,CONV_RTE) + + +#define LAYER_NORM_AXIS01_F16_F32to16Bits(name,temp_type,dst_type,conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_F16_F32to##name( \ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \ + image2d_array_t output, float eps) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ + int2 coord_sum = (int2)(0, gidz); \ + int4 coord_para = coord; \ + vxc_short8 src0; \ + vxc_half8 in_h; \ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + vxc_float4 mean_vari = (vxc_float4)(0); \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_sum); \ + coord_sum.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + coord_para.z = 0; \ + coord_para.w = 0; \ + int4 coord_bias = coord_para; \ + \ + int8 input_desc, scale_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr); \ + \ + vxc_float4 tmpData0, tmpData1; \ + vxc_short8 outval; \ + temp_type tmpVal0, tmpVal1; \ + dst_type dst; \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.y ++; \ + coord_bias.y = coord.y; \ + bias_f0 = read_imagef(bias, coord_bias); \ + scale_f0 = read_imagef(scale, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + scale_f1 = read_imagef(scale, coord_bias); \ + coord_bias.x = coord.x; \ + \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + \ + vxc_float4 sub, norm; \ + sub = tmpData0 - mean_vari.s0; \ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(conv_type, tmpVal0, norm); \ + sub = tmpData1 - mean_vari.s0; \ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(conv_type, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} 
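Each kernel in these new layer-normalization sources, quantized or not, opens with the same preamble: copy the image2d_array_t handle into an int8 descriptor with _viv_asm(COPY, ...), compute baseAddr = coord.z * desc.s4 + desc.s0, and move that into the coordinate's plane component before the img_load_3d/img_store_3d loop. Read literally, this pre-resolves the base address of the z-th plane so the per-row loop only has to bump coord_in.y. The tiny C sketch below mirrors just that address arithmetic; treating s0 as the descriptor's base field and s4 as its per-plane stride is an assumption drawn from the expression itself, not from documented ovxlib internals.

#include <stdint.h>

/* Hypothetical mirror of the kernel preamble (assumption: desc_s0 corresponds to
 * the descriptor base field input_desc.s0, desc_s4 to the per-plane stride
 * input_desc.s4). */
static int32_t resolve_plane_base(int32_t desc_s0, int32_t desc_s4, int32_t z)
{
    /* baseAddr = (int)coord.z * input_desc.s4 + input_desc.s0 */
    return z * desc_s4 + desc_s0;
}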
+LAYER_NORM_AXIS01_F16_F32to16Bits(F16,half4,vxc_half8,CONV) +LAYER_NORM_AXIS01_F16_F32to16Bits(I16,int4,vxc_short8,CONV_RTE) + +#define LAYER_NORM_AXIS01_F16_F16toQUANT(name,dst_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_F16_F16to##name( \ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \ + image2d_array_t output, float eps) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ + int2 coord_sum = (int2)(0, gidz); \ + int4 coord_para = coord; \ + vxc_short8 src0; \ + vxc_short8 src1; \ + vxc_half8 scale_h, in_h; \ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + vxc_float4 mean_vari = (vxc_float4)(0); \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_sum); \ + coord_sum.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + coord_para.z = 0; \ + coord_para.w = 0; \ + int4 coord_bias = coord_para; \ + \ + int8 input_desc, scale_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ + \ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); \ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; \ + _viv_asm(MOV, coord_para.w, baseAddr_c); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr); \ + \ + vxc_float4 tmpData0, tmpData1; \ + dst_type outval; \ + vxc_int4 tmpVal0, tmpVal1; \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.y ++; \ + coord_para.y = coord.y; \ + coord_bias.y = coord.y; \ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x = coord.x; \ + \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + \ + vxc_float4 sub, norm; \ + sub = tmpData0 - mean_vari.s0; \ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + sub = tmpData1 - mean_vari.s0; \ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_AXIS01_F16_F16toQUANT(U8,vxc_uchar16) +LAYER_NORM_AXIS01_F16_F16toQUANT(I8,vxc_char16) + +#define 
LAYER_NORM_AXIS01_F16_F32toQUANT(name,dst_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_F16_F32to##name( \ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \ + image2d_array_t output, float eps) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ + int2 coord_sum = (int2)(0, gidz); \ + int4 coord_para = coord; \ + vxc_short8 src0; \ + vxc_half8 in_h; \ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + vxc_float4 mean_vari = (vxc_float4)(0); \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_sum); \ + coord_sum.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + coord_para.z = 0; \ + coord_para.w = 0; \ + int4 coord_bias = coord_para; \ + \ + int8 input_desc, scale_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr); \ + \ + vxc_float4 tmpData0, tmpData1; \ + dst_type outval; \ + vxc_int4 tmpVal0, tmpVal1; \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.y ++; \ + coord_bias.y = coord.y; \ + \ + bias_f0 = read_imagef(bias, coord_bias); \ + scale_f0 = read_imagef(scale, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + scale_f1 = read_imagef(scale, coord_bias); \ + coord_bias.x = coord.x; \ + \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + \ + vxc_float4 sub, norm; \ + sub = tmpData0 - mean_vari.s0; \ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + sub = tmpData1 - mean_vari.s0; \ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_AXIS01_F16_F32toQUANT(U8,vxc_uchar16) +LAYER_NORM_AXIS01_F16_F32toQUANT(I8,vxc_char16) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_3.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_3.vx new file mode 100644 index 0000000..ae812a2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_3.vx @@ -0,0 +1,178 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform int height; +_viv_uniform uint group_num; +_viv_uniform float output_zp; +_viv_uniform float output_scale; +_viv_uniform float inv_multiplier; + +#define 
LAYER_NORM_AXIS01_I16_F16to16Bits(name,temp_type,dst_type,conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I16_F16to##name( \ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \ + image2d_array_t output, float eps) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ + int2 coord_sum = (int2)(0, gidz); \ + int4 coord_para = coord; \ + vxc_short8 src0, src1; \ + vxc_half8 scale_h; \ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + vxc_float4 mean_vari = (vxc_float4)(0); \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_sum); \ + coord_sum.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + coord_para.z = 0; \ + coord_para.w = 0; \ + int4 coord_bias = coord_para; \ + \ + int8 input_desc, scale_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ + \ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); \ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; \ + _viv_asm(MOV, coord_para.w, baseAddr_c); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr); \ + \ + vxc_float4 tmpData0, tmpData1, norm; \ + temp_type tmpVal0, tmpVal1; \ + vxc_short8 outval; \ + dst_type dst; \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.y ++; \ + coord_para.y = coord.y; \ + coord_bias.y = coord.y; \ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x = coord.x; \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + tmpData0 = tmpData0 - mean_vari.s0; \ + tmpData1 = tmpData1 - mean_vari.s0; \ + \ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(conv_type, tmpVal0, norm); \ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(conv_type, tmpVal1, norm); \ + \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_AXIS01_I16_F16to16Bits(F16,half4,vxc_half8,CONV) +LAYER_NORM_AXIS01_I16_F16to16Bits(I16,int4,vxc_short8,CONV_RTE) + + +#define 
LAYER_NORM_AXIS01_I16_F32to16Bits(name,temp_type,dst_type,conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I16_F32to##name( \ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \ + image2d_array_t output, float eps) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ + int2 coord_sum = (int2)(0, gidz); \ + int4 coord_para = coord; \ + vxc_short8 src0; \ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + vxc_float4 mean_vari = (vxc_float4)(0); \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_sum); \ + coord_sum.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + coord_para.z = 0; \ + coord_para.w = 0; \ + int4 coord_bias = coord_para; \ + \ + int8 input_desc, scale_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr); \ + \ + vxc_float4 tmpData0, tmpData1, norm; \ + temp_type tmpVal0, tmpVal1; \ + vxc_short8 outval; \ + dst_type dst; \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.y ++; \ + coord_bias.y = coord.y; \ + bias_f0 = read_imagef(bias, coord_bias); \ + scale_f0 = read_imagef(scale, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + scale_f1 = read_imagef(scale, coord_bias); \ + coord_bias.x = coord.x; \ + \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + tmpData0 = tmpData0 - mean_vari.s0; \ + tmpData1 = tmpData1 - mean_vari.s0; \ + \ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(conv_type, tmpVal0, norm); \ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(conv_type, tmpVal1, norm); \ + \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_AXIS01_I16_F32to16Bits(F16,half4,vxc_half8,CONV) +LAYER_NORM_AXIS01_I16_F32to16Bits(I16,int4,vxc_short8,CONV_RTE) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_sum.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_sum.vx new file mode 100644 index 0000000..9e87880 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_sum.vx @@ -0,0 +1,228 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniSumX_16x1; +_viv_uniform VXC_512Bits uniSumX2_16x1; +_viv_uniform VXC_512Bits uniSum_X_X2_8x2; +_viv_uniform int width; +_viv_uniform int height; + + + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void 
layernorm_axis01_sums_F16toF32( + image2d_array_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + vxc_half8 in_h; + vxc_float4 sumsqr; + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniSum_X_X2_8x2); + tmpSumSqr += sumsqr; + } + } + + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0; + float sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_sums_I16toF32( + image2d_array_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + float4 tmpSumSqr = (float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniSum_X_X2_8x2); + tmpSumSqr += sumsqr; + } + } + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + float4 data = (float4)(0); + for(int i = 0; i < 4; i++) + { + data.x += dot(tmp_sum[i], one); + data.y += dot(tmp_sqr[i], one); + } + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_sums_U8toF32( + image2d_array_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; 
coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord.xywz, 0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); + tmpSum += (tmpSum1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); + tmpSqr += (tmpSqr1); + } + sqr += convert_float(tmpSqr); + sum = convert_float(tmpSum); + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_sums_I8toF32( + image2d_array_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_char16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord.xywz, 0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); + tmpSum += (tmpSum1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); + tmpSqr += (tmpSqr1); + } + sqr += convert_float(tmpSqr); + sum = convert_float(tmpSum); + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis0.vx new file mode 100644 index 0000000..0f673cb --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis0.vx @@ -0,0 +1,190 @@ +#include "cl_viv_vx_ext.h" +_viv_uniform float rlogE; +_viv_uniform int axisSize; +_viv_uniform float betaValue; +_viv_uniform float scaleLogE; +_viv_uniform float outputScale; +_viv_uniform float output_offset_asymmetric; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform int height; +_viv_uniform int inputWidth; +_viv_uniform int inputWidthRemain4; +_viv_uniform VXC_512Bits uniGetSubData0to3_4x4; +_viv_uniform VXC_512Bits uniGetSubData4to7_4x4; +_viv_uniform VXC_512Bits uniPackMaxData_2x8; + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0(read_fun, vert_max_fun, horz_max_fun) \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, val, val0, 16); \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 16;coord.x < (axisSize + 16);coord.x+=32) \ + { \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + read_fun(val1, input, coord, VXC_5BITOFFSET_XY(-8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val1, val1, 16); \ + read_fun(val2, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val2, val2, 16); \ + read_fun(val3, input, coord, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val3, val3, 16); \ + vert_max_fun(val, img_val0, img_val1, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + vert_max_fun(val, img_val2, img_val3, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + horz_max_fun(val, val, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(val, val, val, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); \ + horz_max_fun(val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_float4 prob; \ + float fProbSum = 0; \ + const float4 one4 = (float4)(1.0, 1.0, 1.0, 1.0); \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 0;coord.x < inputWidth;coord.x+=4) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + VXC_DP4x4(prob, img_val0, val,\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \ + prob *= scaleLogE; \ + prob = exp2(prob); \ + fProbSum += dot(prob, one4); \ + } \ + } \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \ + prob *= scaleLogE; \ + if(inputWidthRemain4 == 1) \ + { \ + prob.x = exp2(prob.x); \ + prob.yzw = 0; \ + fProbSum += dot(prob, one4); \ + } \ + else if(inputWidthRemain4 == 2) \ + { \ + prob.x = exp2(prob.x); \ + prob.y = exp2(prob.y); \ + prob.zw = 0; \ + fProbSum += dot(prob, one4); \ + } \ + else if(inputWidthRemain4 == 3) \ + { \ + prob.x = exp2(prob.x); \ + prob.y = exp2(prob.y); \ + prob.z = exp2(prob.z); \ + prob.w = 0; \ + fProbSum += dot(prob, one4); \ + } \ + vxc_float4 probSum_log; \ + probSum_log.x = log2(fProbSum) * rlogE; + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_SAVE(dst_type, \ + save_type, conv_mode, OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 0; coord.x < axisSize; coord.x += 8) \ + { \ + dst_type vec0, vec1; \ + save_type dst; \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \ + prob = prob * betaValue - probSum_log.xxxx; \ + prob = prob * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, vec0, prob); \ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData4to7_4x4); \ + prob = prob * betaValue - probSum_log.xxxx; \ + prob = prob * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, vec1, prob); \ + VXC_DP2x8(dst, vec0, vec1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); \ + } \ + } + +#define LOGSOFTMAX_EXCEED_AXIS0(src_name, dst_name, src_type, copy_type, dst_type,\ + save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun, horz_max_fun) \ +__kernel void log_softmax_exceed_axis0_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(16, 0, get_global_id(1), 0); \ + src_type img_val0, img_val1, img_val2, img_val3; \ + copy_type val0, val1, val2, val3; \ + src_type val; \ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0(VXC_ReadImage2DArray, vert_max_fun, horz_max_fun) \ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \ +} + +LOGSOFTMAX_EXCEED_AXIS0(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_EXCEED_AXIS0(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_EXCEED_AXIS0(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_EXCEED_AXIS0(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ + CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_EXCEED_AXIS0(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS0(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS0(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS0(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8,\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS0(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) + + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_TOF32_SAVE(read_fun) \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 0; coord.x < axisSize; ) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \ + prob = prob * betaValue - probSum_log.xxxx; \ + write_imagef(output, coord, prob); \ + coord.x += 4; \ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData4to7_4x4); \ + prob = prob * betaValue - probSum_log.xxxx; \ + write_imagef(output, coord, prob); \ + coord.x += 4; \ + } \ + } + +#define LOGSOFTMAX_EXCEED_AXIS0_TOF32(src_name, src_type, copy_type, vert_max_fun, horz_max_fun) \ +__kernel void log_softmax_exceed_axis0_##src_name##toF32 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(16, 0, get_global_id(1), 0); \ + src_type img_val0, img_val1, img_val2, img_val3; \ + copy_type val0, val1, val2, val3; \ + src_type val; \ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0(VXC_ReadImage2DArray, vert_max_fun, horz_max_fun) 
\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_TOF32_SAVE(VXC_ReadImage2DArray) \ +} + +LOGSOFTMAX_EXCEED_AXIS0_TOF32(F16, vxc_half8, vxc_short8, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_EXCEED_AXIS0_TOF32(I16, vxc_short8, vxc_short8, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS0_TOF32(I8, vxc_char16, vxc_char16, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS0_TOF32(U8, vxc_uchar16, vxc_uchar16, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis0_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis0_BF16.vx new file mode 100644 index 0000000..45f904f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis0_BF16.vx @@ -0,0 +1,187 @@ +#include "cl_viv_vx_ext.h" +_viv_uniform float rlogE; +_viv_uniform int axisSize; +_viv_uniform float betaValue; +_viv_uniform float scaleLogE; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; + +_viv_uniform int height; +_viv_uniform int inputWidth; +_viv_uniform int inputWidthRemain4; +_viv_uniform VXC_512Bits uniPackMaxData_2x8; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; + + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16(read_fun) \ + vxc_half8 img_val0, img_val1, img_val2, img_val3; \ + vxc_short8 val0, val1, val2, val3; \ + vxc_half8 val; \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, val, val0, 16); \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 16; coord.x < (axisSize + 16);) \ + { \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + read_fun(val1, input, coord, VXC_5BITOFFSET_XY(-8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val1, val1, 16); \ + read_fun(val2, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val2, val2, 16); \ + read_fun(val3, input, coord, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val3, val3, 16); \ + coord.x += 32; \ + VXC_VertMax3_Half(val, img_val0, img_val1, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_VertMax3_Half(val, img_val2, img_val3, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + VXC_HorzMax3_Half(val, val, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(val, val, val, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); \ + VXC_HorzMax3_Half(val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + vxc_ushort8 bf_val_tmp; \ + vxc_float4 vecA; \ + _viv_asm(COPY, bf_val_tmp, val, 16); \ + VXC_DP2x8(bf_val_tmp, bf_val_tmp, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecA, bf_val_tmp, 16); \ + vxc_float4 prob; \ + float fProbSum = 0; \ + const float4 one4 = (float4)(1.0, 1.0, 1.0, 1.0); \ + float max_value = vecA.x * scaleLogE; \ + float max_value_orig = vecA.x; \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 0; coord.x < inputWidth; ) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(bf_val_tmp, val0, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, prob, bf_val_tmp, 16); \ + prob = prob * scaleLogE - max_value; \ + prob 
= exp2(prob); \ + fProbSum += dot(prob, one4); \ + coord.x += 4; \ + } \ + } \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(bf_val_tmp, val0, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, prob, bf_val_tmp, 16); \ + prob = prob * scaleLogE - max_value; \ + if(inputWidthRemain4 == 1) \ + { \ + prob.x = exp2(prob.x); \ + prob.yzw = 0; \ + fProbSum += dot(prob, one4); \ + } \ + else if(inputWidthRemain4 == 2) \ + { \ + prob.x = exp2(prob.x); \ + prob.y = exp2(prob.y); \ + prob.zw = 0; \ + fProbSum += dot(prob, one4); \ + } \ + else if(inputWidthRemain4 == 3) \ + { \ + prob.x = exp2(prob.x); \ + prob.y = exp2(prob.y); \ + prob.z = exp2(prob.z); \ + prob.w = 0; \ + fProbSum += dot(prob, one4); \ + } \ + vxc_float4 probSum_log; \ + probSum_log.x = log2(fProbSum) * rlogE; + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOBF16_SAVE(read_fun, write_fun) \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 0; coord.x < axisSize; ) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(bf_val_tmp, val0, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, prob, bf_val_tmp, 16); \ + prob = prob - max_value_orig; \ + prob = prob * betaValue - probSum_log.xxxx; \ + vxc_ushort8 tmp, dst; \ + _viv_asm(COPY, tmp, prob, 16); \ + dst.s0123 = tmp.s1357; \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 4; \ + } \ + } + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOF16_SAVE(read_fun, write_fun) \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 0; coord.x < axisSize; ) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(bf_val_tmp, val0, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, prob, bf_val_tmp, 16); \ + prob = prob - max_value_orig; \ + prob = prob * betaValue - probSum_log.xxxx; \ + half4 vec; \ + vxc_half4 tmp; \ + vxc_short4 dst; \ + _viv_asm(CONV, vec, prob); \ + VXC_DP4x4(tmp, vec, vec, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, dst, tmp, 8); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 4; \ + } \ + } + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOF32_SAVE(read_fun) \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 0; coord.x < axisSize; ) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(bf_val_tmp, val0, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, prob, bf_val_tmp, 16); \ + prob = prob - max_value_orig; \ + prob = prob * betaValue - probSum_log.xxxx; \ + write_imagef(output, coord, prob); \ + coord.x += 4; \ + } \ + } + +__kernel void log_softmax_exceed_axis0_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(16, 0, get_global_id(1), 0); + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16(VXC_ReadImage2DArray) + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOBF16_SAVE(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} +__kernel void log_softmax_exceed_axis0_BF16toF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(16, 
0, get_global_id(1), 0); + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16(VXC_ReadImage2DArray) + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOF16_SAVE(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} +__kernel void log_softmax_exceed_axis0_BF16toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(16, 0, get_global_id(1), 0); + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16(VXC_ReadImage2DArray) + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOF32_SAVE(VXC_ReadImage2DArray) +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis1.vx new file mode 100644 index 0000000..179735c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis1.vx @@ -0,0 +1,172 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float rlogE; +_viv_uniform int depth; +_viv_uniform int axisSize; +_viv_uniform float betaValue; +_viv_uniform float scaleLogE; +_viv_uniform float outputScale; +_viv_uniform float output_offset_asymmetric; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniGetSubLoData_4x4; +_viv_uniform VXC_512Bits uniGetSubHiData_4x4; + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS1(read_fun, vert_max_fun) \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, max, in0, 16); \ + for (coord.z = 0; coord.z < depth; coord.z ++) \ + { \ + for (coord.y = 0; coord.y < axisSize;) \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + vert_max_fun(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + } \ + } \ + coord.y = 0; \ + sum0 = 0; \ + sum1 = 0; \ + for (coord.z = 0; coord.z < depth; coord.z ++) \ + { \ + for (coord.y = 0; coord.y < axisSize;coord.y++) \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \ + VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \ + data0 *= scaleLogE; \ + data0 = exp2(data0); \ + sum0 += data0; \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \ + VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \ + data0 *= scaleLogE; \ + data0 = exp2(data0); \ + sum1 += data0; \ + } \ + } \ + sum0 = log2(sum0) * rlogE; \ + sum1 = log2(sum1) * rlogE; + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS1_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \ + coord.y = 0; \ + dst_type dst0, dst1; \ + save_type vect; \ + for (coord.z = 0; coord.z < depth; coord.z ++) \ + { \ + for (coord.y = 0; coord.y < axisSize;) \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \ + VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \ + data0 = data0 * betaValue - sum0; \ + data0 = data0 * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst0, data0); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \ + VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \ + data0 = data0 * betaValue - sum1; \ + data0 = data0 * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst1, data0); \ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, \ + VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \ + write_fun(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + } \ + } \ + +#define 
LOGSOFTMAX_EXCEED_AXIS1(src_name, dst_name, src_type, copy_type, dst_type,\ +save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun) \ +__kernel void log_softmax_exceed_axis1_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, 0, 0); \ + src_type vec0, max; \ + copy_type in0; \ + vxc_float4 data0; \ + vxc_float4 sum0, sum1; \ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1(VXC_ReadImage2DArray, vert_max_fun) \ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \ +} + + + +LOGSOFTMAX_EXCEED_AXIS1(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Half) +LOGSOFTMAX_EXCEED_AXIS1(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half) +LOGSOFTMAX_EXCEED_AXIS1(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half) +LOGSOFTMAX_EXCEED_AXIS1(F16, U8, vxc_half8, vxc_short8, uchar4,\ +vxc_uchar8, CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half) + +LOGSOFTMAX_EXCEED_AXIS1(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer) + +LOGSOFTMAX_EXCEED_AXIS1(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer) + +LOGSOFTMAX_EXCEED_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16, uchar4,\ +vxc_uchar8, CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, \ +vxc_short8, CONV, 1, 0, VXC_VertMax3_Integer) + + + +#define LOGSOFTMAX_EXCEED_AXIS1_TOF32(src_name, src_type, copy_type, vert_max_fun) \ +__kernel void log_softmax_exceed_axis1_##src_name##toF32 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, 0, 0); \ + src_type vec0, max; \ + copy_type in0; \ + vxc_float4 data0; \ + vxc_float4 sum0, sum1; \ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1(VXC_ReadImage2DArray, vert_max_fun) \ + coord.y = 0; \ + for (coord.z = 0; coord.z < depth; coord.z ++) \ + { \ + for (coord.y = 0; coord.y < axisSize;) \ + { \ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \ + VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \ + data0 = data0 * betaValue - sum0; \ + write_imagef(output, coord, data0); \ + coord.x += 4; \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \ + VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \ + data0 = data0 * betaValue - sum1; \ + write_imagef(output, coord, data0); \ + coord.x -= 4; \ + coord.y++; \ + } \ + } \ +} + +LOGSOFTMAX_EXCEED_AXIS1_TOF32(F16, vxc_half8, \ +vxc_short8, VXC_VertMax3_Half) +LOGSOFTMAX_EXCEED_AXIS1_TOF32(I16, vxc_short8, \ +vxc_short8, VXC_VertMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS1_TOF32(I8, vxc_char16, \ +vxc_char16, VXC_VertMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS1_TOF32(U8, vxc_uchar16, \ +vxc_uchar16, VXC_VertMax3_Integer) \ No newline at end of file diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis1_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis1_BF16.vx new file mode 100644 index 0000000..f592e31 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis1_BF16.vx @@ -0,0 +1,180 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float rlogE; +_viv_uniform int depth; +_viv_uniform int axisSize; +_viv_uniform float betaValue; +_viv_uniform float scaleLogE; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; + +_viv_uniform VXC_512Bits uniExtractHalf8_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS1_BF16(read_fun) \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, max, in0, 16); \ + for (coord.z = 0; coord.z < depth; coord.z ++) \ + { \ + for (coord.y = 0; coord.y < axisSize;) \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_VertMax3_Half(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + } \ + } \ + _viv_asm(COPY, tmp0, max, 16); \ + VXC_DP2x8(tmp1, tmp0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, max_lo, tmp1, 16); \ + VXC_DP2x8(tmp1, tmp0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, max_hi, tmp1, 16); \ + coord.y = 0; \ + sum0 = 0; \ + sum1 = 0; \ + for (coord.z = 0; coord.z < depth; coord.z ++) \ + { \ + for (coord.y = 0; coord.y < axisSize;) \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data0, tmp1, 16); \ + data0 = data0 - max_lo; \ + data0 *= scaleLogE; \ + sum0 += exp2(data0); \ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, data0, tmp1, 16); \ + data0 = data0 - max_hi; \ + data0 *= scaleLogE; \ + sum1 += exp2(data0); \ + coord.y++; \ + } \ + } \ + sum0 = log2(sum0) * rlogE; \ + sum1 = log2(sum1) * rlogE; + +__kernel void log_softmax_exceed_axis1_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(get_global_id(0), 0, 0, 0); + vxc_short8 in0; + vxc_half8 vec0, max; + vxc_float4 data0; + vxc_float4 sum0, sum1; + vxc_float4 max_lo, max_hi; + vxc_ushort8 tmp0, tmp1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + + LOGSOFTMAX_PROCESS_EXCEED_AXIS1_BF16(VXC_ReadImage2DArray) + + coord.y = 0; + vxc_ushort8 dst0, dst1, dst; + for (coord.z = 0; coord.z < depth; coord.z ++) + { + for (coord.y = 0; coord.y < axisSize;) + { + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_lo; + data0 = data0 * betaValue - sum0; + _viv_asm(COPY, dst0, data0, 16); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_hi; + data0 = data0 * betaValue - sum1; + _viv_asm(COPY, dst1, data0, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), 
uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + } + } +} + +__kernel void log_softmax_exceed_axis1_BF16toF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(get_global_id(0), 0, 0, 0); + vxc_short8 in0; + vxc_half8 vec0, max; + vxc_float4 data0; + vxc_float4 sum0, sum1; + vxc_float4 max_lo, max_hi; + vxc_ushort8 tmp0, tmp1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + + LOGSOFTMAX_PROCESS_EXCEED_AXIS1_BF16(VXC_ReadImage2DArray) + + coord.y = 0; + half4 dst0, dst1; + for (coord.z = 0; coord.z < depth; coord.z ++) + { + for (coord.y = 0; coord.y < axisSize;) + { + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_lo; + data0 = data0 * betaValue - sum0; + _viv_asm(CONV, dst0, data0); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_hi; + data0 = data0 * betaValue - sum1; + _viv_asm(CONV, dst1, data0); + VXC_DP2x8(vec0, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); + vxc_short8 vect; + _viv_asm(COPY, vect, vec0, 16); + VXC_WriteImage2DArray(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + } + } +} + +__kernel void log_softmax_exceed_axis1_BF16toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(get_global_id(0), 0, 0, 0); + vxc_short8 in0; + vxc_half8 vec0, max; + vxc_float4 data0; + vxc_float4 sum0, sum1; + vxc_float4 max_lo, max_hi; + vxc_ushort8 tmp0, tmp1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + + LOGSOFTMAX_PROCESS_EXCEED_AXIS1_BF16(VXC_ReadImage2DArray) + + coord.y = 0; + for (coord.z = 0; coord.z < depth; coord.z ++) + { + for (coord.y = 0; coord.y < axisSize;) + { + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_lo; + data0 = data0 * betaValue - sum0; + write_imagef(output, coord, data0); + coord.x += 4; + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_hi; + data0 = data0 * betaValue - sum1; + write_imagef(output, coord, data0); + coord.x -= 4; + coord.y++; + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_rggb_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_rggb_copy.vx new file mode 100644 index 0000000..ae92f69 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_rggb_copy.vx @@ -0,0 +1,111 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int bOrder; +_viv_uniform int rOrder; + +_viv_uniform float outputScaleVar_b; +_viv_uniform float outputScaleVar_g; +_viv_uniform float outputScaleVar_r; + +_viv_uniform float bMeanScaleVarZp; +_viv_uniform float gMeanScaleVarZp; +_viv_uniform float rMeanScaleVarZp; + +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4; +_viv_uniform VXC_512Bits 
uniConvertNV12toR_4x4; + +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8; +_viv_uniform VXC_512Bits uniExtractYtoShortSub16_2x8; +_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4; + +#define NV12_RGGB_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ +__kernel void pre_process_nv12_rggb_copy_##name \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t uv_img, \ + __write_only image2d_array_t output, \ + global int* xRatio, \ + global int* yRatio, \ + global int* xOffset, \ + global int* yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse_channel, \ + int trans, \ + int nv_type, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int sy = gidy + (*yOffset); \ + int sx = gidx + (*xOffset); \ + int uvX = sx & 0xfffffffe; \ + int uvY = sy >> 1; \ + \ + vxc_uchar16 Y, UV; \ + \ + VXC_ReadImage(Y, y_img, (int2)(sx,sy), 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), 0,VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + if (nv_type == 3) \ + { \ + UV.s0123 = UV.s1032; \ + } \ + \ + vxc_short8 tmpY; \ + vxc_char16 tmpUV; \ + short tmpVal = 16; \ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractYtoShortSub16_2x8); \ + tmpVal = 128; \ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \ + \ + float4 tmpDstB, tmpDstG, tmpDstR; \ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + \ + conv_type result; \ + dst_type dst0; \ + save_type dst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstB); \ + dstPos.z = bOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstG); \ + dstPos.z = 1; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + dstPos.z = 2; \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstR); \ + dstPos.z = rOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, 
dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +NV12_RGGB_COPY_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8) +NV12_RGGB_COPY_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8) +NV12_RGGB_COPY_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16) +NV12_RGGB_COPY_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_rggb_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_rggb_scale.vx new file mode 100644 index 0000000..ade2a15 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_rggb_scale.vx @@ -0,0 +1,247 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int bOrder; +_viv_uniform int rOrder; + +_viv_uniform float outputScaleVar_b; +_viv_uniform float outputScaleVar_g; +_viv_uniform float outputScaleVar_r; + +_viv_uniform float bMeanScaleVarZp; +_viv_uniform float gMeanScaleVarZp; +_viv_uniform float rMeanScaleVarZp; + +_viv_uniform uint xrIntFloat_16; +_viv_uniform uint yrIntFloat_16; + +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4; + +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; + +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8; +_viv_uniform VXC_512Bits uniConvertYtoShortSub16_2x8; + +_viv_uniform VXC_512Bits uniCalculateYShift_2x8; +_viv_uniform VXC_512Bits uniCalculateUVShift_2x8; +_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4; + +#define NV12_RGGB_OPT_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ +__kernel void pre_process_nv12_rggb_scale_##name##_gq \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t uv_img, \ + __write_only image2d_array_t output, \ + global int* xRatio, \ + global int* yRatio, \ + global int* xOffset, \ + global int* yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse_channel, \ + int trans, \ + int nv_type, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + uint4 gidx = get_global_id(0); \ + uint gidy = get_global_id(1); \ + gidx += (uint4)(0, 1, 2, 3); \ + \ + uint dy = (gidy * yrIntFloat_16) >> 16; \ + uint4 dx = (gidx * xrIntFloat_16) >> 16; \ + int sy = convert_int(dy) + (*yOffset); \ + int4 sx = convert_int4(dx) + (*xOffset); \ + int4 uvX = sx & 0xfffffffe; \ + int uvY = sy >> 1; \ + \ + vxc_uchar16 Y, UV; \ + int2 coord = (int2)(sx.x, sy); \ + int2 coord_uv = (int2)(uvX.x, uvY); \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + if (nv_type == 3) \ + { \ + UV.s0123456789abcdef = UV.s1032547698badcfe; \ + } \ + \ + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \ + vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \ + int4 offsetUV = uvX - uvX.x; \ + \ + vxc_ushort8 diffY, diffUV; \ + _viv_asm(COPY, diffY, sx, 16); \ + _viv_asm(COPY, diffUV, offsetUV, 16); \ + \ + vxc_ushort8 constData = 8; \ + VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniCalculateYShift_2x8); \ + VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniCalculateUVShift_2x8); \ + VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_BitExtract(UV, UV, UV, maskShiftUv, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_short8 tmpY; \ + vxc_char16 tmpUV; \ + short tmpVal = 16; \ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertYtoShortSub16_2x8); \ + tmpVal = 128; \ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \ + \ + float4 tmpDstB, tmpDstG, tmpDstR; \ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + \ + conv_type result; \ + dst_type dst0; \ + save_type dst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstB); \ + dstPos.z = bOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstG); \ + dstPos.z = 1; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + dstPos.z = 2; \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstR); \ + dstPos.z = rOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +NV12_RGGB_OPT_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8) +NV12_RGGB_OPT_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8) +NV12_RGGB_OPT_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16) +NV12_RGGB_OPT_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16) + +#define NV12_RGGB_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ +__kernel void pre_process_nv12_rggb_scale_##name \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t uv_img, \ + __write_only image2d_array_t output, \ + global int* xRatio, \ + global int* yRatio, \ + global int* xOffset, \ + global int* yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse_channel, \ + int trans, \ + int nv_type, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + uint4 gidx = get_global_id(0); \ + uint gidy = get_global_id(1); \ + gidx += (uint4)(0, 1, 2, 3); \ + \ + uint dy = (gidy * yrIntFloat_16) >> 16; \ + uint4 dx = (gidx * xrIntFloat_16) >> 16; \ + int sy = convert_int(dy) + (*yOffset); \ + int4 sx = convert_int4(dx) + (*xOffset); \ + int4 
uvX = sx & 0xfffffffe; \ + int uvY = sy >> 1; \ + \ + vxc_uchar16 Y, UV; \ + int2 coord = (int2)(sx.x, sy); \ + int2 coord_uv = (int2)(uvX.x, uvY); \ + \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord.x = sx.y; \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord.x = sx.z; \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord.x = sx.w; \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_uv.x = uvX.y; \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_uv.x = uvX.z; \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_uv.x = uvX.w; \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + if (nv_type == 3) \ + { \ + UV.s01234567 = UV.s10325476; \ + } \ + \ + vxc_short8 tmpY; \ + vxc_char16 tmpUV; \ + short tmpVal = 16; \ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertYtoShortSub16_2x8); \ + tmpVal = 128; \ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \ + \ + float4 tmpDstB, tmpDstG, tmpDstR; \ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + \ + conv_type result; \ + dst_type dst0; \ + save_type dst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstB); \ + dstPos.z = bOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstG); \ + dstPos.z = 1; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + dstPos.z = 2; \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstR); \ + dstPos.z = rOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +NV12_RGGB_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8) 
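The scale variants above perform the same per-pixel NV12/NV21 to RGB conversion as the copy variants, only with source coordinates derived from the fixed-point ratios (xrIntFloat_16/yrIntFloat_16) plus xOffset/yOffset; the UV byte swap guarded by nv_type presumably handles VU-ordered (NV21) chroma. The colour math itself lives in the uniConvertNV12toB/G/R_4x4 dot-product tables programmed on the host side, so the constants in the scalar C sketch below are only the usual BT.601 video-range coefficients, assumed for illustration; the helper names are likewise hypothetical.

#include <math.h>

/* Scalar sketch of the per-pixel NV12 -> RGB step performed by the kernel:
 * subtract the video-range offsets (16 for Y, 128 for U/V, matching
 * uniConvertYtoShortSub16_2x8 / uniConvertUVtoCharSub128_2x8), apply
 * BT.601-style coefficients (assumed here; the real values sit in the
 * uniConvertNV12toB/G/R_4x4 tables), and clamp to 8 bits. */
static unsigned char clamp_u8(float v)
{
    return (unsigned char)fminf(fmaxf(v, 0.0f), 255.0f);
}

static void nv12_pixel_to_rgb(unsigned char y, unsigned char u, unsigned char v,
                              unsigned char *r, unsigned char *g, unsigned char *b)
{
    float yf = (float)y - 16.0f;
    float uf = (float)u - 128.0f;
    float vf = (float)v - 128.0f;
    *r = clamp_u8(1.164f * yf + 1.596f * vf);
    *g = clamp_u8(1.164f * yf - 0.392f * uf - 0.813f * vf);
    *b = clamp_u8(1.164f * yf + 2.017f * uf);
}

After this step the kernel folds quantization into the store: each channel is scaled by outputScaleVar_b/g/r and offset by the matching *MeanScaleVarZp term before being packed to the destination type, and the G value is written to both plane 1 and plane 2, which is what makes the output layout RGGB rather than plain RGB.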
+NV12_RGGB_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8) +NV12_RGGB_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16) +NV12_RGGB_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx index 2546ca5..5c09554 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx @@ -89,80 +89,79 @@ __kernel void resize_bilinear_F16toF16_DOWN VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } -__kernel void resize_bilinear_F16toU8_DOWN - ( - __read_only image2d_array_t input, - __write_only image2d_array_t output, - int align_corners, - int half_pixel_centers - ) -{ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); - float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; - float4 left_x_f = floor(in_x); - float4 x_lerp = in_x - left_x_f; - int4 left_x_idx = convert_int4(left_x_f); - float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; - float top_y_f = floor(in_y); - float y_lerp = in_y - top_y_f; - int top_y_idx = convert_int(top_y_f); - - vxc_short8 top_short, bottom_short; - vxc_half8 top, bottom; - int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.y; - VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.z; - VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.w; - VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, top, top_short, 16); - _viv_asm(COPY, bottom, bottom_short, 16); - - float4 left4; - float4 right4; - float4 top4; - float4 bottom4; - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); - top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); - VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); - bottom4 = right4 * x_lerp + left4; - bottom4 -= top4; - float4 dst4 = bottom4 * y_lerp + top4; - dst4 = dst4 * uint8Scale + output_ZP; - int4 dst = convert_int4_rte(dst4); - vxc_uchar8 
dst_uchar; - VXC_DP2x8(dst_uchar, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - - int8 output_desc; - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.w, baseAddr); - - VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_uchar, - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +#define RESIZE_BILINEAR_F16TOQINT_DOWN(out_name, dst_type) \ +__kernel void resize_bilinear_F16to##out_name##_DOWN( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int align_corners, \ + int half_pixel_centers \ + ) \ +{ \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); \ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; \ + float4 left_x_f = floor(in_x); \ + float4 x_lerp = in_x - left_x_f; \ + int4 left_x_idx = convert_int4(left_x_f); \ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; \ + float top_y_f = floor(in_y); \ + float y_lerp = in_y - top_y_f; \ + int top_y_idx = convert_int(top_y_f); \ + \ + vxc_short8 top_short, bottom_short; \ + vxc_half8 top, bottom; \ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); \ + \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = left_x_idx.y; \ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = left_x_idx.z; \ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = left_x_idx.w; \ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, top, top_short, 16); \ + _viv_asm(COPY, bottom, bottom_short, 16); \ + \ + float4 left4, right4, top4, bottom4; \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * uint8Scale + output_ZP; \ + int4 dst = convert_int4_rte(dst4); \ + dst_type dst_uchar; \ + VXC_DP2x8(dst_uchar, 
dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_uchar, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } +RESIZE_BILINEAR_F16TOQINT_DOWN(U8, vxc_uchar8) +RESIZE_BILINEAR_F16TOQINT_DOWN(U16, vxc_ushort8) __kernel void resize_bilinear_F16toF16_UP ( diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx index 8f4735b..b195ee7 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx @@ -1,13 +1,15 @@ #include "cl_viv_vx_ext.h" +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4; _viv_uniform VXC_512Bits uniExtact8Bit_2x8; _viv_uniform float2 scale_xy; _viv_uniform int depth; +_viv_uniform int input_ZP; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; _viv_uniform VXC_512Bits uniConvertI32toI16_2x8; _viv_uniform VXC_512Bits uniGetMaskShift_2x8; -_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4; -_viv_uniform VXC_512Bits uniRightSubLeft_4x4; -_viv_uniform float dfpScale; _viv_uniform float half_pixel_value; __kernel void resize_bilinear_I16toI16_UP @@ -56,23 +58,23 @@ __kernel void resize_bilinear_I16toI16_UP VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + short inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + vxc_ushort8 bitextract_p0; vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); vxc_ushort8 constData = 16; VXC_DP2x8(maskShift, bitextract_p0, constData, \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); int8 output_desc; _viv_asm(COPY, output_desc, output, sizeof(output_desc)); baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - float4 left4; - float4 right4; - float4 top4; - float4 bottom4; + float4 left4, right4, top4, bottom4; int loop = depth - 1; while (coord_in.z < loop) @@ -91,18 +93,18 @@ __kernel void resize_bilinear_I16toI16_UP VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; - dst4 = dst4 * dfpScale; + dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); @@ -115,17 +117,17 @@ __kernel void resize_bilinear_I16toI16_UP VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, top, dst0, 16); _viv_asm(COPY, bottom, dst1, 16); - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; - dst4 = dst4 * dfpScale; + dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); @@ -180,25 +182,25 @@ __kernel void resize_bilinear_I16toI16_DOWN VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - float4 left4; - float4 right4; - float4 top4; - float4 bottom4; + short inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); - VXC_DP4x4(left4, top, top, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); - VXC_DP4x4(right4, top, top, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + float4 left4, right4, top4, bottom4; + + VXC_DP4x4(left4, top, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); - VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; - dst4 = dst4 * dfpScale; + dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); @@ -209,6 +211,6 @@ __kernel void resize_bilinear_I16toI16_DOWN _viv_asm(MOV, coord_out.w, baseAddr); VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); 
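This patch reworks the I16 (and, below, I8) bilinear kernels from the symmetric dfpScale path to asymmetric quantization: samples are dequantized by subtracting input_ZP (uniU8SubZPtoFp32_left_4x4), interpolated in float, then requantized with uint8Scale plus output_ZP. A scalar C sketch of one output sample, assuming uint8Scale is the input/output scale ratio the host packs into that uniform (function and parameter names are illustrative):

#include <math.h>

/* One output pixel of the asymmetric-quantized bilinear path:
 * dequantize with the input zero point, lerp horizontally then
 * vertically in float, requantize with the combined scale and the
 * output zero point. */
static int bilinear_quant_sample(int tl, int tr, int bl, int br,
                                 float x_lerp, float y_lerp,
                                 int input_zp, float scale_io, float output_zp)
{
    float top    = (float)(tl - input_zp) + (float)(tr - tl) * x_lerp;  /* left4 + right4 * x_lerp */
    float bottom = (float)(bl - input_zp) + (float)(br - bl) * x_lerp;
    float value  = top + (bottom - top) * y_lerp;                       /* bottom4 * y_lerp + top4 */
    return (int)lroundf(value * scale_io + output_zp);                  /* dst4 * uint8Scale + output_ZP */
}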
} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx index bcb465e..1364370 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx @@ -1,13 +1,15 @@ #include "cl_viv_vx_ext.h" +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4; _viv_uniform VXC_512Bits uniExtact8Bit_2x8; _viv_uniform float2 scale_xy; _viv_uniform int depth; +_viv_uniform int input_ZP; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; _viv_uniform VXC_512Bits uniConvertI32toI16_2x8; _viv_uniform VXC_512Bits uniGetMaskShift_2x8; -_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4; -_viv_uniform VXC_512Bits uniRightSubLeft_4x4; -_viv_uniform float dfpScale; _viv_uniform float half_pixel_value; __kernel void resize_bilinear_I8toI8_UP @@ -52,13 +54,16 @@ __kernel void resize_bilinear_I8toI8_UP VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + short inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + vxc_ushort8 bitextract_p0; vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); vxc_ushort8 constData = 8; VXC_DP2x8(maskShift, bitextract_p0, constData, \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); int8 output_desc; _viv_asm(COPY, output_desc, output, sizeof(output_desc)); @@ -84,22 +89,22 @@ __kernel void resize_bilinear_I8toI8_UP VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(left4, top, top, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(left4, top, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, top, top, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; - dst4 = dst4 * dfpScale; + dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); @@ -111,19 +116,19 @@ __kernel void resize_bilinear_I8toI8_UP VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, top, dst0, 16); _viv_asm(COPY, bottom, dst1, 16); - VXC_DP4x4(left4, top, top, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(left4, top, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, top, top, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniRightSubLeft_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; - dst4 = dst4 * dfpScale; + dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); @@ -174,25 +179,28 @@ __kernel void resize_bilinear_I8toI8_DOWN VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + short inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + float4 left4; float4 right4; float4 top4; float4 bottom4; - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); - VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; - dst4 = dst4 * dfpScale; + dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U16.vx new file mode 100644 index 0000000..46cdb40 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U16.vx @@ -0,0 +1,278 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform float2 scale_xy; +_viv_uniform int depth; +_viv_uniform int input_ZP; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8; +_viv_uniform VXC_512Bits uniGetMaskShift_2x8; +_viv_uniform float half_pixel_value; + +__kernel void resize_bilinear_U16toF16_DOWN + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + 
float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + vxc_ushort8 top, bottom; + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + float4 left4, right4, top4, bottom4; + + short inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + + dst4 *= uint8Scale; + + half4 dst; + _viv_asm(CONV, dst, dst4); + + vxc_short8 dst_short; + _viv_asm(COPY, dst_short, dst, 16); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_short.s0246, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_bilinear_U16toU16_UP + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + + vxc_ushort8 src0, src1, src2, src3; + + vxc_ushort8 top; + vxc_ushort8 bottom; + + int4 coord_in = 
(int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + short inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + + vxc_ushort8 bitextract_p0; + vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + vxc_ushort8 constData = 16; + VXC_DP2x8(maskShift, bitextract_p0, constData, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 left4, right4, top4, bottom4; + + int loop = depth - 1; + while (coord_in.z < loop) + { + VXC_BitExtract(top, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(bottom, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.zw += (int2)(1, input_desc.s4); + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + dst4 = dst4 * uint8Scale + output_ZP; + int4 dst = convert_int4_rte(dst4); + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.zw += (int2)(1, output_desc.s4); + } + + VXC_BitExtract(top, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(bottom, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom, inputZP, \ + 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + dst4 = dst4 * uint8Scale + output_ZP; + int4 dst = convert_int4_rte(dst4); + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_bilinear_U16toU16_DOWN + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + vxc_ushort8 top, bottom, result; + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + float4 left4, right4, top4, bottom4; + + short inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + + dst4 = dst4 * uint8Scale + output_ZP; + int4 dst = convert_int4_rte(dst4); + + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, 
sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx index 88f0cd5..12ae503 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx @@ -133,6 +133,9 @@ __kernel void resize_bilinear_U8toU8_UP VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + short inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + vxc_ushort8 bitextract_p0; vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ @@ -163,8 +166,6 @@ __kernel void resize_bilinear_U8toU8_UP VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - unsigned char inputZP; - _viv_asm(COPY, inputZP, input_ZP, 4); VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; @@ -185,8 +186,7 @@ __kernel void resize_bilinear_U8toU8_UP VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - unsigned char inputZP; - _viv_asm(COPY, inputZP, input_ZP, 4); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_cubic.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_cubic.vx new file mode 100644 index 0000000..4424c94 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_cubic.vx @@ -0,0 +1,270 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#include "cl_viv_vx_ext.h" + +_viv_uniform float input_scale; +_viv_uniform float input_tail; +_viv_uniform float output_scale; +_viv_uniform float output_tail; +_viv_uniform VXC_512Bits uniFp16ToFp32_4x4; +_viv_uniform VXC_512Bits uniExtractHalf8_2x8; +_viv_uniform VXC_512Bits uniExtract8Bit_2x8; + +#define RESIZE_CUBIC_PART0 \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_index = coord_out; \ + int2 coord_scalew = (int2)(4 * get_global_id(0), 0); \ + int2 coord_scaleh = (int2)(4 * get_global_id(1), 0); \ + float4 cubic_coeffs_y; \ + float4 cubic_coeffs_x; \ + int4 coord_in = (int4)(0, 0, coord_out.z, 0); \ + float4 src0_f,src1_f,src2_f,src3_f; \ + float4 dst = (float4)(0,0,0,0); \ + float sum[4]; \ + int i = 0; \ + \ + Image scalew = create_image_from_image2d(scale_w, 4); \ + Image scaleh = create_image_from_image2d(scale_h, 4); \ + \ + uchar* scale_w_ptr = get_image_ptr_from_coord(scalew, coord_scalew); \ + __global float* scale_x = (__global float*)scale_w_ptr; \ + \ + uchar* scale_h_ptr = get_image_ptr_from_coord(scaleh, coord_scaleh); \ + __global float* scale_y = (__global float*)scale_h_ptr; \ + cubic_coeffs_y = vload4(0, scale_y); \ + \ + int index_y = read_imagei(index_h, coord_index.yw).x; 
\ + coord_in.y = index_y; \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); + +#define RESIZE_CUBIC_16Bitsto16Bits(name,src_type,dst_type,temp_type) \ +__kernel void resize_cubic_##name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + __read_only image2d_t scale_w, \ + __read_only image2d_t scale_h, \ + __read_only image2d_t index_w, \ + __read_only image2d_t index_h \ + ) \ +{ \ + RESIZE_CUBIC_PART0; \ + src_type src0_h,src1_h,src2_h,src3_h; \ + vxc_short4 src0,src1,src2,src3; \ + for (i = 0; i < 4; i++) \ + { \ + coord_in.x = read_imagei(index_w, coord_index.xw).x; \ + cubic_coeffs_x = vload4(i, scale_x); \ + coord_index.x = coord_index.x + 1; \ + \ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + _viv_asm(COPY, src0_h, src0, 8); \ + _viv_asm(COPY, src1_h, src1, 8); \ + _viv_asm(COPY, src2_h, src2, 8); \ + _viv_asm(COPY, src3_h, src3, 8); \ + \ + VXC_DP4x4(src0_f, src0_h, src0_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src1_f, src1_h, src1_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src2_f, src2_h, src2_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src3_f, src3_h, src3_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + \ + dst = src0_f * cubic_coeffs_y.x \ + + src1_f * cubic_coeffs_y.y \ + + src2_f * cubic_coeffs_y.z \ + + src3_f * cubic_coeffs_y.w; \ + sum[i] = dot(dst, cubic_coeffs_x); \ + } \ + float4 sum_f = (float4)(sum[0],sum[1],sum[2],sum[3]); \ + temp_type tmpout; \ + _viv_asm(CONV,tmpout,sum_f); \ + dst_type out_h; \ + vxc_short4 out; \ + VXC_DP2x8(out_h, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \ + _viv_asm(COPY, out, out_h, 8); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +RESIZE_CUBIC_16Bitsto16Bits(F16toF16,vxc_half4, vxc_half4, half4) +RESIZE_CUBIC_16Bitsto16Bits(I16toI16,vxc_short4,vxc_short4,short4) +RESIZE_CUBIC_16Bitsto16Bits(F16toI16,vxc_half4, vxc_short4,short4) +RESIZE_CUBIC_16Bitsto16Bits(I16toF16,vxc_short4,vxc_half4, half4) + + +#define RESIZE_CUBIC_Quant8toQuant8(name,data_type) \ +__kernel void resize_cubic_##name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + __read_only image2d_t scale_w, \ + __read_only image2d_t scale_h, \ + __read_only image2d_t index_w, \ + __read_only image2d_t index_h \ + ) \ +{ \ + RESIZE_CUBIC_PART0; \ + data_type src0,src1,src2,src3; \ + for (i = 0; i < 4; i++) \ + { \ + coord_in.x = read_imagei(index_w, coord_index.xw).x; \ + cubic_coeffs_x = vload4(i, scale_x); \ + coord_index.x = coord_index.x + 1; \ + \ + VXC_OP4(img_load_3d, src0, input, 
coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src1_f, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src2_f, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src3_f, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + src0_f = src0_f * input_scale + input_tail; \ + src1_f = src1_f * input_scale + input_tail; \ + src2_f = src2_f * input_scale + input_tail; \ + src3_f = src3_f * input_scale + input_tail; \ + \ + dst = src0_f * cubic_coeffs_y.x \ + + src1_f * cubic_coeffs_y.y \ + + src2_f * cubic_coeffs_y.z \ + + src3_f * cubic_coeffs_y.w; \ + sum[i] = dot(dst, cubic_coeffs_x); \ + sum[i] = sum[i] * output_scale + output_tail; \ + } \ + float4 sum_f = (float4)(sum[0],sum[1],sum[2],sum[3]); \ + int4 tmpout; \ + _viv_asm(CONV,tmpout,sum_f); \ + data_type out; \ + VXC_DP2x8(out, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +RESIZE_CUBIC_Quant8toQuant8(U8toU8,vxc_uchar4) +RESIZE_CUBIC_Quant8toQuant8(I8toI8,vxc_char4 ) + +#define RESIZE_CUBIC_F16toQuant8(name,dst_type) \ +__kernel void resize_cubic_##name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + __read_only image2d_t scale_w, \ + __read_only image2d_t scale_h, \ + __read_only image2d_t index_w, \ + __read_only image2d_t index_h \ + ) \ +{ \ + RESIZE_CUBIC_PART0; \ + vxc_half4 src0_h,src1_h,src2_h,src3_h; \ + vxc_short4 src0,src1,src2,src3; \ + for (i = 0; i < 4; i++) \ + { \ + coord_in.x = read_imagei(index_w, coord_index.xw).x; \ + cubic_coeffs_x = vload4(i, scale_x); \ + coord_index.x = coord_index.x + 1; \ + \ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + _viv_asm(COPY, src0_h, src0, 8); \ + _viv_asm(COPY, src1_h, src1, 8); \ + _viv_asm(COPY, src2_h, src2, 8); \ + _viv_asm(COPY, src3_h, src3, 8); \ + \ + VXC_DP4x4(src0_f, src0_h, src0_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src1_f, src1_h, src1_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src2_f, src2_h, src2_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src3_f, src3_h, src3_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + \ + dst = src0_f * cubic_coeffs_y.x \ + + src1_f * cubic_coeffs_y.y \ + + src2_f * cubic_coeffs_y.z \ + + src3_f * cubic_coeffs_y.w; \ 
+ sum[i] = dot(dst, cubic_coeffs_x); \ + sum[i] = sum[i] * output_scale + output_tail; \ + } \ + float4 sum_f = (float4)(sum[0],sum[1],sum[2],sum[3]); \ + int4 tmpout; \ + _viv_asm(CONV,tmpout,sum_f); \ + dst_type out; \ + VXC_DP2x8(out, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +RESIZE_CUBIC_F16toQuant8(F16toU8,vxc_uchar4) +RESIZE_CUBIC_F16toQuant8(F16toI8,vxc_char4) + +#define RESIZE_CUBIC_Quant8toF16(name,src_type) \ +__kernel void resize_cubic_##name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + __read_only image2d_t scale_w, \ + __read_only image2d_t scale_h, \ + __read_only image2d_t index_w, \ + __read_only image2d_t index_h \ + ) \ +{ \ + RESIZE_CUBIC_PART0; \ + src_type src0,src1,src2,src3; \ + for (i = 0; i < 4; i++) \ + { \ + coord_in.x = read_imagei(index_w, coord_index.xw).x; \ + cubic_coeffs_x = vload4(i, scale_x); \ + coord_index.x = coord_index.x + 1; \ + \ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src1_f, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src2_f, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src3_f, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + \ + src0_f = src0_f * input_scale + input_tail; \ + src1_f = src1_f * input_scale + input_tail; \ + src2_f = src2_f * input_scale + input_tail; \ + src3_f = src3_f * input_scale + input_tail; \ + \ + dst = src0_f * cubic_coeffs_y.x \ + + src1_f * cubic_coeffs_y.y \ + + src2_f * cubic_coeffs_y.z \ + + src3_f * cubic_coeffs_y.w; \ + sum[i] = dot(dst, cubic_coeffs_x); \ + } \ + float4 sum_f = (float4)(sum[0],sum[1],sum[2],sum[3]); \ + half4 tmpout; \ + _viv_asm(CONV,tmpout,sum_f); \ + vxc_half4 out_h; \ + vxc_short4 out; \ + VXC_DP2x8(out_h, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \ + _viv_asm(COPY, out, out_h, 8); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +RESIZE_CUBIC_Quant8toF16(U8toF16,vxc_uchar4) +RESIZE_CUBIC_Quant8toF16(I8toF16,vxc_char4) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_reduction.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_reduction.vx new file mode 100644 index 0000000..a10393f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_reduction.vx @@ -0,0 +1,259 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int update_width; +_viv_uniform int output_width; + +_viv_uniform int4 coord_stride; +_viv_uniform int4 coord_stride1; + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; + +_viv_uniform int input_zp; +_viv_uniform float input_scale; +_viv_uniform int update_zp; +_viv_uniform float update_scale; 
+_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndU8SubZpToFp32_4x4; + +inline void AtomicAdd_float(volatile __global float *source, const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = prevVal.floatVal + operand; + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +inline void AtomicMul_float(volatile __global float *source, const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = prevVal.floatVal * operand; + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +inline void AtomicMax_float(volatile __global float *source, const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = fmax(prevVal.floatVal, operand); + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +inline void AtomicMin_float(volatile __global float *source, const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = fmin(prevVal.floatVal, operand); + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +#define SCATTER_REDUCTION_PREPROCESS(name0, ptr0, type0, len0, size0, ptr2) \ +__kernel void scatter_nd_update_reduction_preprocess_##name0( \ + __read_only image2d_t input_ref, \ + image2d_t temp_buf_float, \ + int length, int res) \ +{ \ + int gidx = get_global_id(0); \ + Image img1 = create_image_from_image2d(input_ref, size0); \ + Image img2 = create_image_from_image2d(temp_buf_float, 4); \ + __global float* tmp_ref_ptr = (__global float*)img2.ptr; \ + type0 src; \ + float4 tmpDst0, tmpDst1; \ + short zp = input_zp; \ + if(length > 0) \ + { \ + __global ptr0* input_ptr = (__global ptr0*)img1.ptr; \ + ptr0 tmpData = input_ptr[gidx]; \ + int loc2 = gidx * 8; \ + _viv_asm(COPY, src, tmpData, len0); \ + VXC_DP4x4(tmpDst0, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpDst1, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvert2ndU8SubZpToFp32_4x4); \ + tmpDst0 *= input_scale; \ + tmpDst1 *= input_scale; \ + vstore4(tmpDst0, 0, tmp_ref_ptr + loc2); \ + vstore4(tmpDst1, 1, tmp_ref_ptr + loc2); \ + } \ + __global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \ + for(int i = gidx; i < res; i += get_global_size(0)) \ + { \ + ptr2 tmpData1 = input_ptr1[length + i]; \ + _viv_asm(COPY, src, tmpData1, 4); \ + VXC_DP4x4(tmpDst0, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + tmp_ref_ptr[length + i] = tmpDst0.x; \ + } \ +} +SCATTER_REDUCTION_PREPROCESS(U8, vxc_uchar8, vxc_uchar8, 8, 1, uchar) +SCATTER_REDUCTION_PREPROCESS(I8, vxc_char8, vxc_char8, 8, 1, char) +SCATTER_REDUCTION_PREPROCESS(I16, vxc_short8, vxc_short8, 16, 2, short) 
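The Atomic*_float helpers above emulate floating-point atomics with atomic_cmpxchg by treating the 32-bit value as an unsigned integer and retrying until no other work-item has modified it between the read and the swap; the reduction kernels further down use them to accumulate dequantized updates into the float temp buffer produced by the preprocess kernels. A minimal host-side C11 sketch of the same compare-and-swap idiom (the function name and the use of <stdatomic.h> are illustrative, not part of the patch):

#include <stdatomic.h>
#include <stdint.h>
#include <string.h>

/* Add `operand` to *source atomically by retrying a compare-and-swap on
 * the raw 32-bit pattern, mirroring AtomicAdd_float in the kernel above;
 * Mul/Max/Min follow by swapping the combining operation. */
static void atomic_add_float(_Atomic uint32_t *source, float operand)
{
    uint32_t prev_bits, new_bits;
    float prev_val, new_val;
    do {
        prev_bits = atomic_load(source);
        memcpy(&prev_val, &prev_bits, sizeof prev_val);  /* reinterpret bits as float */
        new_val = prev_val + operand;
        memcpy(&new_bits, &new_val, sizeof new_bits);
    } while (!atomic_compare_exchange_weak(source, &prev_bits, new_bits));
}

Because the comparison is on bit patterns, any concurrent update (or a spurious weak-CAS failure) simply forces another pass through the loop, which is the same behaviour the kernel's do/while relies on.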
+SCATTER_REDUCTION_PREPROCESS(F16, vxc_short8, vxc_half8, 16, 2, short) + +#define SCATTER_ND_REDUCTION_PROCESS_F16(name0, func) \ +__kernel void scatter_nd_update_reduction_##name0##_F16( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + image2d_t temp_buf_float, \ + image2d_t link_buffer0, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(update, 2); \ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global short* update_ptr = (__global short*)img2.ptr; \ + __global float* output_ptr = (__global float*)img3.ptr; \ + half src; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + short tmpData = update_ptr[gidy * update_width + gidx]; \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + int loc = idx * output_width + gidx; \ + _viv_asm(COPY, src, tmpData, 4); \ + float data; \ + _viv_asm(CONV, data, src); \ + func(output_ptr + loc, data); \ +} +SCATTER_ND_REDUCTION_PROCESS_F16(Add, AtomicAdd_float) +SCATTER_ND_REDUCTION_PROCESS_F16(Mul, AtomicMul_float) +SCATTER_ND_REDUCTION_PROCESS_F16(Max, AtomicMax_float) +SCATTER_ND_REDUCTION_PROCESS_F16(Min, AtomicMin_float) + +#define SCATTER_ND_REDUCTION_PROCESS_BF16(name0, func) \ +__kernel void scatter_nd_update_reduction_##name0##_BF16( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + image2d_t temp_buf_float, \ + image2d_t link_buffer0, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(update, 2); \ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global short* update_ptr = (__global short*)img2.ptr; \ + __global float* output_ptr = (__global float*)img3.ptr; \ + half src; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + short tmpData = update_ptr[gidy * update_width + gidx]; \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + vxc_short8 src0, src1; \ + float data; \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + int loc = idx * output_width + gidx; \ + _viv_asm(COPY, src0, tmpData, 4); \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data, src1, 4); \ + func(output_ptr + loc, data); \ +} +SCATTER_ND_REDUCTION_PROCESS_BF16(Add, AtomicAdd_float) +SCATTER_ND_REDUCTION_PROCESS_BF16(Mul, AtomicMul_float) +SCATTER_ND_REDUCTION_PROCESS_BF16(Max, AtomicMax_float) +SCATTER_ND_REDUCTION_PROCESS_BF16(Min, AtomicMin_float) + +#define SCATTER_ND_UPDATE_PROCESS_QINT(name0, src0_type, data_type, ptr_type, element_size, func) \ +__kernel void scatter_nd_update_reduction_##name0##_##src0_type( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + image2d_t temp_buf_float, \ + image2d_t link_buffer0, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(update, element_size); \ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \ + __global float* output_ptr = (__global float*)img3.ptr; \ + data_type src; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + ptr_type tmpData = update_ptr[gidy * update_width + gidx]; \ + short zp = update_zp; \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + int loc = idx * output_width + gidx; \ + _viv_asm(COPY, src, tmpData, 4); \ + vxc_float4 data; \ + VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + data.x *= update_scale; \ + func(output_ptr + loc, data.x); \ +} +SCATTER_ND_UPDATE_PROCESS_QINT(Add, U8, vxc_uchar8, uchar, 1, AtomicAdd_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, U8, vxc_uchar8, uchar, 1, AtomicMul_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Max, U8, vxc_uchar8, uchar, 1, AtomicMax_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Min, U8, vxc_uchar8, uchar, 1, AtomicMin_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Add, I8, vxc_char8, char, 1, AtomicAdd_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, I8, vxc_char8, char, 1, AtomicMul_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Max, I8, vxc_char8, char, 1, AtomicMax_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Min, I8, vxc_char8, char, 1, AtomicMin_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Add, I16, vxc_short8, short, 2, AtomicAdd_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, I16, vxc_short8, short, 2, AtomicMul_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Max, I16, vxc_short8, short, 2, AtomicMax_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Min, I16, vxc_short8, short, 2, AtomicMin_float) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_reduction_conv.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_reduction_conv.vx new file mode 100644 index 0000000..e027a2f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_reduction_conv.vx @@ -0,0 +1,110 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +_viv_uniform VXC_512Bits uniExtractOddData_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniExtractHalf8_2x8; + +#define SCATTER_ND_UPDATE_CONV(src0_type, ptr_type, element_size, ptr_type1, conv_func) \ +__kernel void scatter_nd_update_reduction_conv_##src0_type( \ + __read_only image2d_t temp_buf_float, \ + __read_only image2d_t link_buf, \ + image2d_t output, \ + int length, int res) \ +{ \ + int gidx = get_global_id(0); \ + Image img1 = create_image_from_image2d(temp_buf_float, 4); \ + Image img2 = create_image_from_image2d(output, element_size); \ + __global float* input_ptr = (__global float*)img1.ptr; \ + if(length > 0) \ + { \ + __global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \ + float4 src0 = vload4(0, input_ptr + gidx * 8); \ + float4 src1 = vload4(1, input_ptr + gidx * 8); \ + int4 data0 = convert_int4_rte(src0 * output_scale + output_zp); \ + int4 data1 = convert_int4_rte(src1 * output_scale + output_zp); \ + ptr_type dst; \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + output_ptr[gidx] = dst; \ + } \ + __global ptr_type1* output_ptr1 = (__global ptr_type1*)img2.ptr; \ + for(int i = gidx; i < res; i += get_global_size(0)) \ + { \ + float src = input_ptr[length + i]; \ + int data = convert_int_rte(src * output_scale + output_zp); \ + output_ptr1[length + i] = conv_func(data); \ + } \ +} +SCATTER_ND_UPDATE_CONV(U8, vxc_uchar8, 1, uchar, convert_uchar) +SCATTER_ND_UPDATE_CONV(I8, vxc_char8, 1, char, convert_char) +SCATTER_ND_UPDATE_CONV(I16, vxc_short8, 2, 
short, convert_short) + +__kernel void scatter_nd_update_reduction_conv_F16( + __read_only image2d_t temp_buf_float, + __read_only image2d_t link_buf, + image2d_t output, + int length, int res) +{ + int gidx = get_global_id(0); + Image img1 = create_image_from_image2d(temp_buf_float, 4); + Image img2 = create_image_from_image2d(output, 2); + __global float* input_ptr = (__global float*)img1.ptr; + if(length > 0) + { + __global vxc_short8* output_ptr = (__global vxc_short8*)img2.ptr; + float4 src0 = vload4(0, input_ptr + gidx * 8); + float4 src1 = vload4(1, input_ptr + gidx * 8); + half4 data0, data1; + _viv_asm(CONV, data0, src0); + _viv_asm(CONV, data1, src1); + vxc_half8 tmp; + vxc_short8 dst; + VXC_DP2x8(tmp, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractHalf8_2x8); + _viv_asm(COPY, dst, tmp, 16); + output_ptr[gidx] = dst; + } + __global short* output_ptr1 = (__global short*)img2.ptr; + for(int i = gidx; i < res; i += get_global_size(0)) + { + float src = input_ptr[length + i]; + half data; + _viv_asm(CONV, data, src); + short dst; + _viv_asm(COPY, dst, data, 4); + output_ptr1[length + i] = dst; + } +} + +__kernel void scatter_nd_update_reduction_conv_BF16( + __read_only image2d_t temp_buf_float, + __read_only image2d_t link_buf, + image2d_t output, + int length, int res) +{ + int gidx = get_global_id(0); + Image img1 = create_image_from_image2d(temp_buf_float, 4); + Image img2 = create_image_from_image2d(output, 2); + __global float* input_ptr = (__global float*)img1.ptr; + if(length > 0) + { + __global vxc_short8* output_ptr = (__global vxc_short8*)img2.ptr; + float4 src0 = vload4(0, input_ptr + gidx * 8); + float4 src1 = vload4(1, input_ptr + gidx * 8); + vxc_short8 dst0, dst1, dst; + _viv_asm(COPY, dst0, src0, 16); + _viv_asm(COPY, dst1, src1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + output_ptr[gidx] = dst; + } + __global short* output_ptr1 = (__global short*)img2.ptr; + for(int i = gidx; i < res; i += get_global_size(0)) + { + float src = input_ptr[length + i]; + vxc_short8 data; + _viv_asm(COPY, data, src, 4); + output_ptr1[length + i] = data.x; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx index 8f47577..c9dda26 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx @@ -110,10 +110,10 @@ do\ #define VXC_VertMax3_Integer(dst, src0, src1, src2, info)\ do\ {\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ - int mod1 = VXC_MODIFIER_CLAMP(startBin, endBin, sourceBin, 0);\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ + constant int mod1 = VXC_MODIFIER_CLAMP(startBin, endBin, sourceBin, 0);\ typeof (dst) tmp;\ tmp = max(src0, src1);\ tmp = max(src2, tmp);\ @@ -138,10 +138,10 @@ do\ #define VXC_HorzMax3_Integer(dst, src0, info)\ do\ {\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ - int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ + constant int endBin = (info & 
VXC_END_BIN_BITMASK) >> 8;\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ + constant int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\ VXC_OP4(filter, dst, src0, src0, src0, mod1);\ } while (0) @@ -149,12 +149,12 @@ do\ #define VXC_HorzMax3_Half(dst, src0, info)\ do\ {\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ - int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ - int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\ - int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ + constant int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ + constant int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\ + constant int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\ vxc_short8 val0, minVal, maxVal;\ _viv_asm(COPY, val0, src0, 16);\ VXC_OP4(filter, maxVal, val0, val0, val0, mod1);\ @@ -166,24 +166,24 @@ do\ #define VXC_HorzMin3_Integer(dst, src0, info)\ do\ {\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ - int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ - int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ + constant int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ + constant int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\ VXC_OP4(filter, dst, src0, src0, src0, mod1);\ } while (0) #define VXC_HorzMin3_Half(dst, src0, info)\ do\ {\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ - int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ - int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\ - int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\ - int mod3 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Median, clamp);\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ + constant int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ + constant int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\ + constant int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\ + constant int mod3 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Median, clamp);\ vxc_short8 val0, minVal, maxVal, midVal;\ _viv_asm(COPY, val0, src0, 16);\ VXC_OP4(filter, maxVal, val0, val0, val0, mod1);\ diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index dd10737..68763cc 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -4242,6 
+4242,557 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(\n\ \n\ "; /* end of conv1d_ovxlib_k1024_vx*/ +static const char crop_and_resize_bilinear_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float inOutScale;\n\ +_viv_uniform float inOutTile;\n\ +_viv_uniform float width_scale;\n\ +_viv_uniform float height_scale;\n\ +_viv_uniform int image_width;\n\ +_viv_uniform int image_height;\n\ +_viv_uniform VXC_512Bits uniRightToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniLeftToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Bit_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ +\n\ +#define CROP_AND_RESIZE_PART0 \\\n\ + int i = 0; \\\n\ + int bb = get_global_id(2); \\\n\ + int y = get_global_id(1); \\\n\ + int4 x = (int4)(get_global_id(0),get_global_id(0) + 1, get_global_id(0) + 2, get_global_id(0) + 3); \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int2 coord_box_ind = (int2)(bb, 0); \\\n\ + int b = read_imagei(box_ind, coord_box_ind).x; \\\n\ + float4 xy, in_x; \\\n\ + float in_y; \\\n\ + float4 x_lerp, y_lerp; \\\n\ + int d = 0; \\\n\ + Image img_boxes = create_image_from_image2d(boxes, 2); \\\n\ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \\\n\ + xy = vload_half4(bb, boxes_ptr); \\\n\ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \\\n\ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \\\n\ + \\\n\ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \\\n\ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \\\n\ + in_y = xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale; \\\n\ + y_lerp.x = in_y - floor(in_y); \\\n\ + y_lerp.yzw = y_lerp.xxx;\n\ +\n\ +#define CROP_AND_RESIZE_PART1 \\\n\ + int4 coord = (int4)(0, in_y, d + b * ori_depth, 0); \\\n\ + int8 input_desc, output_desc; \\\n\ + \\\n\ + coord_out.z = d + coord_out.z * ori_depth; \\\n\ + \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.w, baseAddr); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + in_x.x = xy.y * convert_float(image_width - 1); \\\n\ + in_x.yzw = in_x.xxx; \\\n\ + in_x = in_x + convert_float4(x) * _width_scale; \\\n\ + x_lerp = in_x - floor(in_x); \\\n\ + coord.x = floor(in_x.x); \\\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = floor(in_x.y); \\\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = floor(in_x.z); \\\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = floor(in_x.w); \\\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + 
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define CROP_AND_RESIZE_BILINEAR_Quant8toQuant8(name,src_type,dst_type) \\\n\ +__kernel void crop_and_resize_bilinear_##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout \\\n\ +) \\\n\ +{ \\\n\ + CROP_AND_RESIZE_PART0; \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + src_type src0, src1; \\\n\ + CROP_AND_RESIZE_PART1; \\\n\ + \\\n\ + float4 top, bottom, value; \\\n\ + float4 top_left4,top_right4,bottom_left4,bottom_right4; \\\n\ + dst_type data; \\\n\ + int4 tmpout; \\\n\ + \\\n\ + VXC_DP4x4(top_left4, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \\\n\ + VXC_DP4x4(top_right4, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \\\n\ + VXC_DP4x4(bottom_left4, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \\\n\ + VXC_DP4x4(bottom_right4, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \\\n\ + \\\n\ + top = top_left4 + (top_right4 - top_left4) * x_lerp; \\\n\ + bottom = bottom_left4 + (bottom_right4 - bottom_left4) * x_lerp; \\\n\ + value = top + (bottom - top) * y_lerp; \\\n\ + value = value * inOutScale + inOutTile; \\\n\ + _viv_asm(CONV, tmpout, value); \\\n\ + VXC_DP2x8(data, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CROP_AND_RESIZE_BILINEAR_Quant8toQuant8(U8toU8,vxc_uchar8, vxc_uchar4)\n\ +CROP_AND_RESIZE_BILINEAR_Quant8toQuant8(I8toI8,vxc_char8, vxc_char4)\n\ +\n\ +#define CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(name,src_type,dst_type,tmp_type) \\\n\ +__kernel void crop_and_resize_bilinear_##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout \\\n\ +) \\\n\ +{ \\\n\ + CROP_AND_RESIZE_PART0; \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + vxc_short8 src0, src1; \\\n\ + src_type src0_temp, src1_temp; \\\n\ + CROP_AND_RESIZE_PART1; \\\n\ + \\\n\ + _viv_asm(COPY, src0_temp, src0, 16); \\\n\ + _viv_asm(COPY, src1_temp, src1, 16); \\\n\ + float4 top, bottom, value; \\\n\ + float4 top_left4,top_right4,bottom_left4,bottom_right4; \\\n\ + dst_type data; \\\n\ + vxc_short4 out; \\\n\ + tmp_type tmpout; \\\n\ + \\\n\ + VXC_DP4x4(top_left4, src0_temp, src0_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \\\n\ + VXC_DP4x4(top_right4, src0_temp, src0_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \\\n\ + VXC_DP4x4(bottom_left4, src1_temp, src1_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \\\n\ + VXC_DP4x4(bottom_right4, src1_temp, src1_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \\\n\ + \\\n\ + top = top_left4 + (top_right4 - top_left4) * x_lerp; \\\n\ + bottom = bottom_left4 + (bottom_right4 - bottom_left4) * x_lerp; \\\n\ + value = top + (bottom - top) * y_lerp; \\\n\ + value = value * inOutScale + inOutTile; \\\n\ + 
_viv_asm(CONV, tmpout, value); \\\n\ + VXC_DP2x8(data, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \\\n\ + _viv_asm(COPY, out, data, 8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(I16toI16, vxc_short8, vxc_short4, short4)\n\ +CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(I16toF16, vxc_short8, vxc_half4, half4)\n\ +CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(F16toF16, vxc_half8, vxc_half4, half4)\n\ +CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(F16toI16, vxc_half8, vxc_short4, short4)\n\ +\n\ +#define CROP_AND_RESIZE_BILINEAR_F16toQuant8(name,dst_type) \\\n\ +__kernel void crop_and_resize_bilinear_F16to##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout \\\n\ +) \\\n\ +{ \\\n\ + CROP_AND_RESIZE_PART0; \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + vxc_short8 src0, src1; \\\n\ + vxc_half8 src0_temp, src1_temp; \\\n\ + CROP_AND_RESIZE_PART1; \\\n\ + \\\n\ + _viv_asm(COPY, src0_temp, src0, 16); \\\n\ + _viv_asm(COPY, src1_temp, src1, 16); \\\n\ + float4 top, bottom, value; \\\n\ + float4 top_left4,top_right4,bottom_left4,bottom_right4; \\\n\ + dst_type data; \\\n\ + int4 tmpout; \\\n\ + \\\n\ + VXC_DP4x4(top_left4, src0_temp, src0_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \\\n\ + VXC_DP4x4(top_right4, src0_temp, src0_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \\\n\ + VXC_DP4x4(bottom_left4, src1_temp, src1_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \\\n\ + VXC_DP4x4(bottom_right4, src1_temp, src1_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \\\n\ + \\\n\ + top = top_left4 + (top_right4 - top_left4) * x_lerp; \\\n\ + bottom = bottom_left4 + (bottom_right4 - bottom_left4) * x_lerp; \\\n\ + value = top + (bottom - top) * y_lerp; \\\n\ + value = value * inOutScale + inOutTile; \\\n\ + _viv_asm(CONV, tmpout, value); \\\n\ + VXC_DP2x8(data, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, data, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CROP_AND_RESIZE_BILINEAR_F16toQuant8(U8, vxc_uchar4)\n\ +CROP_AND_RESIZE_BILINEAR_F16toQuant8(I8, vxc_char4)\n\ +\n\ +#define CROP_AND_RESIZE_BILINEAR_Quant8toF16(name,src_type) \\\n\ +__kernel void crop_and_resize_bilinear_##name##toF16 \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout \\\n\ +) \\\n\ +{ \\\n\ + CROP_AND_RESIZE_PART0; \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + src_type src0, src1; \\\n\ + CROP_AND_RESIZE_PART1; \\\n\ + \\\n\ + float4 top, bottom, value; \\\n\ + float4 top_left4,top_right4,bottom_left4,bottom_right4; \\\n\ + vxc_half4 data; \\\n\ + vxc_short4 out; \\\n\ + half4 tmpout; \\\n\ + \\\n\ + VXC_DP4x4(top_left4, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \\\n\ + VXC_DP4x4(top_right4, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \\\n\ + VXC_DP4x4(bottom_left4, src1, src1, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \\\n\ + VXC_DP4x4(bottom_right4, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \\\n\ + \\\n\ + top = top_left4 + (top_right4 - top_left4) * x_lerp; \\\n\ + bottom = bottom_left4 + (bottom_right4 - bottom_left4) * x_lerp; \\\n\ + value = top + (bottom - top) * y_lerp; \\\n\ + value = value * inOutScale + inOutTile; \\\n\ + _viv_asm(CONV, tmpout, value); \\\n\ + VXC_DP2x8(data, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \\\n\ + _viv_asm(COPY, out, data, 8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CROP_AND_RESIZE_BILINEAR_Quant8toF16(U8, vxc_uchar8)\n\ +CROP_AND_RESIZE_BILINEAR_Quant8toF16(I8, vxc_char8)\n\ +\n\ +\n\ +"; /* end of crop_and_resize_bilinear_vx*/ + +static const char crop_and_resize_nearest_neighbor_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float inOutScale;\n\ +_viv_uniform float inOutTile;\n\ +_viv_uniform float width_scale;\n\ +_viv_uniform float height_scale;\n\ +_viv_uniform int image_width;\n\ +_viv_uniform int image_height;\n\ +_viv_uniform VXC_512Bits uniConvertFstToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Bit_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ +\n\ +#define IMG_LOAD(src_type) \\\n\ + src_type src; \\\n\ + int4 coord = (int4)(0, in_y, d + b * ori_depth, 0); \\\n\ + int8 input_desc, output_desc; \\\n\ + \\\n\ + coord_out.z = d + coord_out.z * ori_depth; \\\n\ + \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.w, baseAddr); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x) * _width_scale)); \\\n\ + coord.x = in_x; \\\n\ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 1) * _width_scale)); \\\n\ + coord.x = in_x; \\\n\ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 2) * _width_scale)); \\\n\ + coord.x = in_x; \\\n\ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 3) * _width_scale)); \\\n\ + coord.x = in_x; \\\n\ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 4) * _width_scale)); \\\n\ + coord.x = in_x; \\\n\ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 5) * _width_scale)); \\\n\ + coord.x = in_x; \\\n\ + 
VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 6) * _width_scale)); \\\n\ + coord.x = in_x; \\\n\ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 7) * _width_scale)); \\\n\ + coord.x = in_x; \\\n\ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +\n\ +#define CROP_AND_RESIZE_Quant8toQuant8(name, data_type) \\\n\ +__kernel void crop_and_resize_nearest_neighbor_##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout \\\n\ +) \\\n\ +{ \\\n\ + int bb = get_global_id(2); \\\n\ + int y = get_global_id(1); \\\n\ + int x = get_global_id(0); \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int2 coord_box_ind = (int2)(bb, 0); \\\n\ + int b = read_imagei(box_ind, coord_box_ind).x; \\\n\ + float4 xy; \\\n\ + int in_x, in_y; \\\n\ + int d = 0; \\\n\ + Image img_boxes = create_image_from_image2d(boxes, 2); \\\n\ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \\\n\ + xy = vload_half4(bb, boxes_ptr); \\\n\ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \\\n\ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \\\n\ + \\\n\ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \\\n\ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \\\n\ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale)); \\\n\ + \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + data_type data; \\\n\ + int4 tmpout0, tmpout1; \\\n\ + float4 tmpdata0, tmpdata1; \\\n\ + IMG_LOAD(data_type); \\\n\ + \\\n\ + VXC_DP4x4(tmpdata0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \\\n\ + VXC_DP4x4(tmpdata1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \\\n\ + \\\n\ + tmpdata0 = tmpdata0 * inOutScale + inOutTile; \\\n\ + tmpdata1 = tmpdata1 * inOutScale + inOutTile; \\\n\ + _viv_asm(CONV, tmpout0, tmpdata0); \\\n\ + _viv_asm(CONV, tmpout1, tmpdata1); \\\n\ + \\\n\ + VXC_DP2x8(data, tmpout0, tmpout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CROP_AND_RESIZE_Quant8toQuant8(U8toU8, vxc_uchar8)\n\ +CROP_AND_RESIZE_Quant8toQuant8(I8toI8, vxc_char8)\n\ +\n\ +#define CROP_AND_RESIZE_Quant8toF16(name, src_type) \\\n\ +__kernel void crop_and_resize_nearest_neighbor_##name##toF16 \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout \\\n\ +) \\\n\ +{ \\\n\ + int bb = get_global_id(2); \\\n\ + int y = get_global_id(1); \\\n\ + int x = get_global_id(0); \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int2 coord_box_ind = (int2)(bb, 0); \\\n\ + int b = 
read_imagei(box_ind, coord_box_ind).x; \\\n\ + float4 xy; \\\n\ + int in_x, in_y; \\\n\ + int d = 0; \\\n\ + Image img_boxes = create_image_from_image2d(boxes, 2); \\\n\ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \\\n\ + xy = vload_half4(bb, boxes_ptr); \\\n\ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \\\n\ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \\\n\ + \\\n\ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \\\n\ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \\\n\ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale)); \\\n\ + \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + vxc_short8 out; \\\n\ + vxc_half8 data; \\\n\ + half4 tmpout0, tmpout1; \\\n\ + float4 tmpdata0, tmpdata1; \\\n\ + IMG_LOAD(src_type); \\\n\ + \\\n\ + VXC_DP4x4(tmpdata0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \\\n\ + VXC_DP4x4(tmpdata1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \\\n\ + \\\n\ + tmpdata0 = tmpdata0 * inOutScale + inOutTile; \\\n\ + tmpdata1 = tmpdata1 * inOutScale + inOutTile; \\\n\ + _viv_asm(CONV, tmpout0, tmpdata0); \\\n\ + _viv_asm(CONV, tmpout1, tmpdata1); \\\n\ + \\\n\ + VXC_DP2x8(data, tmpout0, tmpout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \\\n\ + _viv_asm(COPY, out, data, 16); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CROP_AND_RESIZE_Quant8toF16(U8, vxc_uchar8)\n\ +CROP_AND_RESIZE_Quant8toF16(I8, vxc_char8)\n\ +\n\ +#define CROP_AND_RESIZE_NEAREST_F16toQuant8(name, dst_type) \\\n\ +__kernel void crop_and_resize_nearest_neighbor_F16to##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout \\\n\ +) \\\n\ +{ \\\n\ + int bb = get_global_id(2); \\\n\ + int y = get_global_id(1); \\\n\ + int x = get_global_id(0); \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int2 coord_box_ind = (int2)(bb, 0); \\\n\ + int b = read_imagei(box_ind, coord_box_ind).x; \\\n\ + float4 xy; \\\n\ + int in_x, in_y; \\\n\ + int d = 0; \\\n\ + Image img_boxes = create_image_from_image2d(boxes, 2); \\\n\ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \\\n\ + xy = vload_half4(bb, boxes_ptr); \\\n\ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \\\n\ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \\\n\ + \\\n\ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \\\n\ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \\\n\ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale)); \\\n\ + \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + dst_type data; \\\n\ + int4 tmpout0, tmpout1; \\\n\ + float4 tmpdata0, tmpdata1; \\\n\ + IMG_LOAD(vxc_short8); \\\n\ + vxc_half8 src_half; \\\n\ + _viv_asm(COPY, src_half, src, 16); \\\n\ + \\\n\ + VXC_DP4x4(tmpdata0, src_half, src_half, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \\\n\ + VXC_DP4x4(tmpdata1, src_half, src_half, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \\\n\ + \\\n\ + tmpdata0 = tmpdata0 * inOutScale + inOutTile; \\\n\ + tmpdata1 = 
tmpdata1 * inOutScale + inOutTile; \\\n\ + _viv_asm(CONV, tmpout0, tmpdata0); \\\n\ + _viv_asm(CONV, tmpout1, tmpdata1); \\\n\ + \\\n\ + VXC_DP2x8(data, tmpout0, tmpout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CROP_AND_RESIZE_NEAREST_F16toQuant8(U8, vxc_uchar8)\n\ +CROP_AND_RESIZE_NEAREST_F16toQuant8(I8, vxc_char8)\n\ +\n\ +#define CROP_AND_RESIZE_16Bitsto16Bits(name,src_type,dst_type,temp_type) \\\n\ +__kernel void crop_and_resize_nearest_neighbor_##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout \\\n\ +) \\\n\ +{ \\\n\ + int bb = get_global_id(2); \\\n\ + int y = get_global_id(1); \\\n\ + int x = get_global_id(0); \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int2 coord_box_ind = (int2)(bb, 0); \\\n\ + int b = read_imagei(box_ind, coord_box_ind).x; \\\n\ + float4 xy; \\\n\ + int in_x, in_y; \\\n\ + int d = 0; \\\n\ + Image img_boxes = create_image_from_image2d(boxes, 2); \\\n\ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \\\n\ + xy = vload_half4(bb, boxes_ptr); \\\n\ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \\\n\ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \\\n\ + \\\n\ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \\\n\ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \\\n\ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale)); \\\n\ + \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + vxc_short8 out; \\\n\ + dst_type data; \\\n\ + temp_type tmpout0, tmpout1; \\\n\ + float4 tmpdata0, tmpdata1; \\\n\ + IMG_LOAD(vxc_short8); \\\n\ + src_type src_temp; \\\n\ + _viv_asm(COPY, src_temp, src, 16); \\\n\ + \\\n\ + VXC_DP4x4(tmpdata0, src_temp, src_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \\\n\ + VXC_DP4x4(tmpdata1, src_temp, src_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \\\n\ + \\\n\ + _viv_asm(CONV, tmpout0, tmpdata0); \\\n\ + _viv_asm(CONV, tmpout1, tmpdata1); \\\n\ + \\\n\ + VXC_DP2x8(data, tmpout0, tmpout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \\\n\ + _viv_asm(COPY, out, data, 16); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CROP_AND_RESIZE_16Bitsto16Bits \\\n\ +(F16toF16, vxc_half8, vxc_half8, half4)\n\ +CROP_AND_RESIZE_16Bitsto16Bits \\\n\ +(F16toI16, vxc_half8, vxc_short8, short4)\n\ +CROP_AND_RESIZE_16Bitsto16Bits \\\n\ +(I16toF16, vxc_short8, vxc_half8, half4)\n\ +CROP_AND_RESIZE_16Bitsto16Bits \\\n\ +(I16toI16, vxc_short8, vxc_short8,short4)\n\ +"; /* end of crop_and_resize_nearest_neighbor_vx*/ + static const char cumsum_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ @@ -9243,6 +9794,11 @@ float4 eltwise_unary_acosh(float4 val)\n\ return acosh(val);\n\ }\n\ \n\ +float4 eltwise_unary_tan(float4 val)\n\ +{\n\ + return native_tan(val);\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -9360,7 +9916,8 @@ ADD_ELTSISE_UNARY_2D(atan)\n\ 
ADD_ELTSISE_UNARY_2D(atanh)\n\ //ACOSH\n\ ADD_ELTSISE_UNARY_2D(acosh)\n\ -\n\ +//TAN\n\ +ADD_ELTSISE_UNARY_2D(tan)\n\ "; /* end of eltwise_unary_2d_1_vx*/ static const char eltwise_unary_3d_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -9714,6 +10271,11 @@ float4 eltwise_unary_acosh(float4 val)\n\ return acosh(val);\n\ }\n\ \n\ +float4 eltwise_unary_tan(float4 val)\n\ +{\n\ + return native_tan(val);\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -9830,6 +10392,8 @@ ADD_ELTSISE_UNARY_3D(atan)\n\ ADD_ELTSISE_UNARY_3D(atanh)\n\ //ACOSH\n\ ADD_ELTSISE_UNARY_3D(acosh)\n\ +//TAN\n\ +ADD_ELTSISE_UNARY_3D(tan)\n\ "; /* end of eltwise_unary_3d_1_vx*/ static const char erf_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -10262,7 +10826,8 @@ __kernel void gather_I8toI8(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10287,7 +10852,8 @@ __kernel void gather_U8toU8(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10312,7 +10878,8 @@ __kernel void gather_I16toI16(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10338,7 +10905,8 @@ __kernel void gather_F16toF16(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10363,7 +10931,8 @@ __kernel void gather_I8toI8_axis0(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ @@ -10390,7 +10959,8 @@ __kernel void gather_U8toU8_axis0(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ @@ -10417,7 +10987,8 @@ __kernel void gather_I16toI16_axis0(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ @@ -10444,7 +11015,8 @@ __kernel void gather_F16toF16_axis0(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ @@ -10479,7 +11051,8 @@ __kernel void gather_I8toI8_array(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10493,13 +11066,29 @@ __kernel void gather_I8toI8_array(\n\ \n\ Image img1 = create_image_from_image2d(input0, 1);\n\ Image img2 = create_image_from_image2d(output, 1);\n\ - uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ - __global vxc_char16* data_ptr = (__global vxc_char16*)input_ptr;\n\ - vxc_char16 src = data_ptr[0];\n\ +\n\ int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ +\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ uchar* output_ptr = 
get_image_ptr_from_coord(img2, coord);\n\ - __global vxc_char16* dst_ptr = (__global vxc_char16*)output_ptr;\n\ - dst_ptr[0] = src;\n\ +\n\ + if (gidx == ((block_size >> 4) * 16))\n\ + {\n\ + __global char* data_ptr = (__global char*)input_ptr;\n\ + __global char* dst_ptr = (__global char*)output_ptr;\n\ + int i = 0;\n\ + for (i = 0; i < block_size - gidx; i ++)\n\ + {\n\ + dst_ptr[i] = data_ptr[i];\n\ + }\n\ + }\n\ + else\n\ + {\n\ + __global vxc_char16* data_ptr = (__global vxc_char16*)input_ptr;\n\ + vxc_char16 src = data_ptr[0];\n\ + __global vxc_char16* dst_ptr = (__global vxc_char16*)output_ptr;\n\ + dst_ptr[0] = src;\n\ + }\n\ }\n\ \n\ __kernel void gather_U8toU8_array(\n\ @@ -10508,7 +11097,8 @@ __kernel void gather_U8toU8_array(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10522,13 +11112,29 @@ __kernel void gather_U8toU8_array(\n\ \n\ Image img1 = create_image_from_image2d(input0, 1);\n\ Image img2 = create_image_from_image2d(output, 1);\n\ - uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ - __global vxc_uchar16* data_ptr = (__global vxc_uchar16*)input_ptr;\n\ - vxc_uchar16 src = data_ptr[0];\n\ +\n\ int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ +\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ - __global vxc_uchar16* dst_ptr = (__global vxc_uchar16*)output_ptr;\n\ - dst_ptr[0] = src;\n\ +\n\ + if (gidx == ((block_size >> 4) * 16))\n\ + {\n\ + __global uchar* data_ptr = (__global uchar*)input_ptr;\n\ + __global uchar* dst_ptr = (__global uchar*)output_ptr;\n\ + int i = 0;\n\ + for (i = 0; i < block_size - gidx; i ++)\n\ + {\n\ + dst_ptr[i] = data_ptr[i];\n\ + }\n\ + }\n\ + else\n\ + {\n\ + __global vxc_uchar16* data_ptr = (__global vxc_uchar16*)input_ptr;\n\ + vxc_uchar16 src = data_ptr[0];\n\ + __global vxc_uchar16* dst_ptr = (__global vxc_uchar16*)output_ptr;\n\ + dst_ptr[0] = src;\n\ + }\n\ }\n\ \n\ __kernel void gather_I16toI16_array(\n\ @@ -10537,7 +11143,8 @@ __kernel void gather_I16toI16_array(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10552,13 +11159,29 @@ __kernel void gather_I16toI16_array(\n\ \n\ Image img1 = create_image_from_image2d(input0, 2);\n\ Image img2 = create_image_from_image2d(output, 2);\n\ - uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ - __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr;\n\ - vxc_short8 src = data_ptr[0];\n\ +\n\ int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ +\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ - __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr;\n\ - dst_ptr[0] = src;\n\ +\n\ + if (gidx == ((block_size >> 3) * 8))\n\ + {\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + int i = 0;\n\ + for (i = 0; i < block_size - gidx; i ++)\n\ + {\n\ + dst_ptr[i] = data_ptr[i];\n\ + }\n\ + }\n\ + else\n\ + {\n\ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr;\n\ + vxc_short8 src = data_ptr[0];\n\ + __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr;\n\ + dst_ptr[0] = src;\n\ + }\n\ }\n\ \n\ 
__kernel void gather_F16toF16_array(\n\ @@ -10567,7 +11190,8 @@ __kernel void gather_F16toF16_array(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10582,13 +11206,29 @@ __kernel void gather_F16toF16_array(\n\ \n\ Image img1 = create_image_from_image2d(input0, 2);\n\ Image img2 = create_image_from_image2d(output, 2);\n\ - uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ - __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr;\n\ - vxc_short8 src = data_ptr[0];\n\ +\n\ int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ +\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ - __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr;\n\ - dst_ptr[0] = src;\n\ +\n\ + if (gidx == ((block_size >> 3) * 8))\n\ + {\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + int i = 0;\n\ + for (i = 0; i < block_size - gidx; i ++)\n\ + {\n\ + dst_ptr[i] = data_ptr[i];\n\ + }\n\ + }\n\ + else\n\ + {\n\ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr;\n\ + vxc_short8 src = data_ptr[0];\n\ + __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr;\n\ + dst_ptr[0] = src;\n\ + }\n\ }\n\ \n\ #define GATHER_AXIS0_ARRAY(src0_type_name, read_type, data_type, write_type) \\\n\ @@ -10598,7 +11238,8 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ @@ -10664,7 +11305,8 @@ __kernel void gather_batch_I8toI8(\n\ __write_only image2d_array_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10695,7 +11337,8 @@ __kernel void gather_batch_U8toU8(\n\ __write_only image2d_array_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10726,7 +11369,8 @@ __kernel void gather_batch_I16toI16(\n\ __write_only image2d_array_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10757,7 +11401,8 @@ __kernel void gather_batch_F16toF16(\n\ __write_only image2d_array_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10788,7 +11433,8 @@ __kernel void gather_batch_I8toI8_axis0(\n\ __write_only image2d_array_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ @@ -10817,7 +11463,8 @@ __kernel void gather_batch_U8toU8_axis0(\n\ __write_only image2d_array_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ @@ -10846,7 +11493,8 @@ __kernel void gather_batch_I16toI16_axis0(\n\ __write_only image2d_array_t 
output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ @@ -10875,7 +11523,8 @@ __kernel void gather_batch_F16toF16_axis0(\n\ __write_only image2d_array_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ @@ -11215,7 +11864,8 @@ __kernel void gather_##src0_type_name##toF16( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ @@ -11252,7 +11902,8 @@ __kernel void gather_F16to##src1_type_name( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ @@ -11285,7 +11936,8 @@ __kernel void gather_I16toF16(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -11320,7 +11972,8 @@ __kernel void gather_##src0_type_name##toF16_axis0( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -11352,7 +12005,8 @@ __kernel void gather_F16to##src1_type_name##_axis0( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -11384,7 +12038,8 @@ __kernel void gather_I16toF16_axis0(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ @@ -11429,7 +12084,8 @@ __kernel void gather_batch_##src0_type_name##toF16( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ @@ -11476,7 +12132,8 @@ __kernel void gather_batch_F16to##src1_type_name( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ @@ -11517,7 +12174,8 @@ __kernel void gather_batch_I16toF16(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -11556,7 +12214,8 @@ __kernel void gather_batch_##src0_type_name##toF16_axis0( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -11591,7 +12250,8 @@ __kernel void gather_batch_F16to##src1_type_name##_axis0( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), 
get_global_id(2), 0); \\\n\ @@ -11626,7 +12286,8 @@ __kernel void gather_batch_I16toF16_axis0(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ @@ -13891,6 +14552,9 @@ static const char grucell_activation_z_h_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ +_viv_uniform float output_scale1;\n\ +_viv_uniform float output_zp1;\n\ +\n\ float4 sigmoid_func(float4 x)\n\ {\n\ x *= -logE;\n\ @@ -14002,13 +14666,15 @@ __kernel void grucell_activation_z_h_##name0##_F16to##name1##_##act_name( \\\n\ VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ h_tm = h_tm * hstate_in_scale + hstate_in_tail; \\\n\ float4 result = (1 - z) * h + z * h_tm; \\\n\ - result = result * output_scale + output_zp; \\\n\ - int4 dst0; \\\n\ - _viv_asm(CONV_RTE, dst0, result); \\\n\ + float4 out0 = result * output_scale + output_zp; \\\n\ + float4 out1 = result * output_scale1 + output_zp1; \\\n\ + int4 dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, out0); \\\n\ + _viv_asm(CONV_RTE, dst1, out1); \\\n\ dst_type dst; \\\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(hstate_out, coord_in, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8)\n\ @@ -15371,6 +16037,11 @@ float4 tanh_func(float4 x)\n\ x = 1.0f / x;\n\ return 2 * x - 1;\n\ }\n\ +float4 relu_func(float4 x)\n\ +{\n\ + x = x > 0 ? 
x : 0;\n\ + return x;\n\ +}\n\ \n\ _viv_uniform VXC_512Bits uniF16PlusF16_0_4x4;\n\ _viv_uniform VXC_512Bits uniF16PlusF16_1_4x4;\n\ @@ -15434,6 +16105,8 @@ __kernel void grucell_reset_after_activation_F16_F16toF16_##act_name##_##rec_act }\n\ GRUCELL_F16_F16TOF16(TANH, tanh_func, SIGMOID, sigmoid_func)\n\ GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func)\n\ +GRUCELL_F16_F16TOF16(RELU, relu_func, SIGMOID, sigmoid_func)\n\ +\n\ \n\ _viv_uniform float hstate_in_scale;\n\ _viv_uniform float hstate_in_tail;\n\ @@ -15499,6 +16172,10 @@ GRUCELL_QNT_F16TO_QNT(I16_F16toI16_TANH_SIGMOID, tanh_func, sigmoid_func, GRUCELL_QNT_F16TO_QNT(U8_F16toU8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ GRUCELL_QNT_F16TO_QNT(I8_F16toI8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_char8, vxc_char8)\n\ GRUCELL_QNT_F16TO_QNT(I16_F16toI16_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_short8, vxc_short8)\n\ +GRUCELL_QNT_F16TO_QNT(U8_F16toU8_RELU_SIGMOID, relu_func, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_QNT(I8_F16toI8_RELU_SIGMOID, relu_func, sigmoid_func, vxc_char8, vxc_char8)\n\ +GRUCELL_QNT_F16TO_QNT(I16_F16toI16_RELU_SIGMOID, relu_func, sigmoid_func, vxc_short8, vxc_short8)\n\ +\n\ \n\ #define GRUCELL_BF16(act_name, act_func, rec_act_name, rec_act_func) \\\n\ __kernel void grucell_reset_after_activation_BF16_BF16toBF16_##act_name##_##rec_act_name( \\\n\ @@ -15561,6 +16238,7 @@ __kernel void grucell_reset_after_activation_BF16_BF16toBF16_##act_name##_##rec_ }\n\ GRUCELL_BF16(TANH, tanh_func, SIGMOID, sigmoid_func)\n\ GRUCELL_BF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func)\n\ +GRUCELL_BF16(RELU, relu_func, SIGMOID, sigmoid_func)\n\ "; /* end of grucell_reset_after_activation_vx*/ static const char hswish_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -18930,6 +19608,1399 @@ __kernel void layer_norm_BF16F32toBF16_2D(\n\ }\n\ }"; /* end of layer_normalization_3_vx*/ +static const char layer_normalization_axis01_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform int height;\n\ +_viv_uniform uint group_num;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float inv_multiplier;\n\ +\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_U8_F16toF16(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + vxc_uchar16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= inv_multiplier;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + coord_para.z = 0;\n\ + coord_para.w = 0;\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + 
_viv_asm(MOV, coord_in.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ + coord_para.y = coord.y; coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + tmpData0 = tmpData0 - mean_vari.s0;\n\ + tmpData1 = tmpData1 - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtract8Data_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_U8_F32toF16(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + vxc_uchar16 src0;\n\ + vxc_short8 outval;\n\ + vxc_half8 scale_h, dst;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= inv_multiplier;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + coord_para.z = 0;\n\ + coord_para.w = 0;\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; 
coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ + coord_bias.y = coord.y;\n\ +\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + scale_f0 = read_imagef(scale, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + scale_f1 = read_imagef(scale, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + tmpData0 = tmpData0 - mean_vari.s0;\n\ + tmpData1 = tmpData1 - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtract8Data_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_U8_F16toU8(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + vxc_uchar16 src0, outval;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= inv_multiplier;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + coord_para.z = 0;\n\ + coord_para.w = 0;\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ + coord_para.y = coord.y; coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, 
scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + tmpData0 = tmpData0 - mean_vari.s0;\n\ + tmpData1 = tmpData1 - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp);\n\ +\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniExtract8Data_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_U8_F32toU8(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + vxc_uchar16 src0 , outval;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= inv_multiplier;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + coord_para.z = 0;\n\ + coord_para.w = 0;\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ + coord_bias.y = coord.y;\n\ +\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + scale_f0 = read_imagef(scale, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + scale_f1 = read_imagef(scale, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + tmpData0 = tmpData0 - mean_vari.s0;\n\ + tmpData1 = tmpData1 - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp);\n\ +\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniExtract8Data_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 
0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_axis01_0_vx*/ + +static const char layer_normalization_axis01_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform int height;\n\ +_viv_uniform uint group_num;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float inv_multiplier;\n\ +\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I8_F16toF16(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + vxc_char16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= inv_multiplier;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + coord_para.z = 0;\n\ + coord_para.w = 0;\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ + coord_para.y = coord.y; coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + tmpData0 = tmpData0 - mean_vari.s0;\n\ + tmpData1 = tmpData1 - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + 
uniExtract8Data_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I8_F32toF16(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + vxc_char16 src0;\n\ + vxc_short8 outval;\n\ + vxc_half8 scale_h, dst;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= inv_multiplier;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + coord_para.z = 0;\n\ + coord_para.w = 0;\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ + coord_bias.y = coord.y;\n\ +\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + scale_f0 = read_imagef(scale, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + scale_f1 = read_imagef(scale, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + tmpData0 = tmpData0 - mean_vari.s0;\n\ + tmpData1 = tmpData1 - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtract8Data_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I8_F16toU8(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + vxc_char16 src0;\n\ + vxc_uchar16 outval;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = 
(vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= inv_multiplier;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + coord_para.z = 0;\n\ + coord_para.w = 0;\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ + coord_para.y = coord.y; coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + tmpData0 = tmpData0 - mean_vari.s0;\n\ + tmpData1 = tmpData1 - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp);\n\ +\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniExtract8Data_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I8_F32toU8(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + vxc_char16 src0;\n\ + vxc_uchar16 outval;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= inv_multiplier;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + coord_para.z = 0;\n\ + coord_para.w = 0;\n\ + 
int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ + coord_bias.y = coord.y;\n\ +\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + scale_f0 = read_imagef(scale, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + scale_f1 = read_imagef(scale, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + tmpData0 = tmpData0 - mean_vari.s0;\n\ + tmpData1 = tmpData1 - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp);\n\ +\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniExtract8Data_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_axis01_1_vx*/ + +static const char layer_normalization_axis01_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform int height;\n\ +_viv_uniform uint group_num;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float inv_multiplier;\n\ +\n\ +#define LAYER_NORM_AXIS01_F16_F16to16Bits(name,temp_type,dst_type,conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_F16_F16to##name( \\\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \\\n\ + image2d_array_t output, float eps) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int2 coord_sum = (int2)(0, gidz); \\\n\ + int4 coord_para = coord; \\\n\ + vxc_short8 src0; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h, in_h; \\\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_sum); \\\n\ + coord_sum.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + coord_para.z = 0; \\\n\ + coord_para.w = 0; \\\n\ + int4 coord_bias = coord_para; \\\n\ + \\\n\ + int8 input_desc, scale_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = 
(int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); \\\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; \\\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr); \\\n\ + \\\n\ + vxc_float4 tmpData0, tmpData1; \\\n\ + vxc_short8 outval; \\\n\ + temp_type tmpVal0, tmpVal1; \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.y ++; \\\n\ + coord_para.y = coord.y; \\\n\ + coord_bias.y = coord.y; \\\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x = coord.x; \\\n\ + \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + \\\n\ + vxc_float4 sub, norm; \\\n\ + sub = tmpData0 - mean_vari.s0; \\\n\ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(conv_type, tmpVal0, norm); \\\n\ + sub = tmpData1 - mean_vari.s0; \\\n\ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(conv_type, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_AXIS01_F16_F16to16Bits(F16,half4,vxc_half8,CONV)\n\ +LAYER_NORM_AXIS01_F16_F16to16Bits(I16,int4,vxc_short8,CONV_RTE)\n\ +\n\ +\n\ +#define LAYER_NORM_AXIS01_F16_F32to16Bits(name,temp_type,dst_type,conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_F16_F32to##name( \\\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \\\n\ + image2d_array_t output, float eps) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int2 coord_sum = (int2)(0, gidz); \\\n\ + int4 coord_para = coord; \\\n\ + vxc_short8 src0; \\\n\ + vxc_half8 in_h; \\\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_sum); \\\n\ + coord_sum.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 
- mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + coord_para.z = 0; \\\n\ + coord_para.w = 0; \\\n\ + int4 coord_bias = coord_para; \\\n\ + \\\n\ + int8 input_desc, scale_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr); \\\n\ + \\\n\ + vxc_float4 tmpData0, tmpData1; \\\n\ + vxc_short8 outval; \\\n\ + temp_type tmpVal0, tmpVal1; \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.y ++; \\\n\ + coord_bias.y = coord.y; \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + scale_f0 = read_imagef(scale, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + scale_f1 = read_imagef(scale, coord_bias); \\\n\ + coord_bias.x = coord.x; \\\n\ + \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + \\\n\ + vxc_float4 sub, norm; \\\n\ + sub = tmpData0 - mean_vari.s0; \\\n\ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(conv_type, tmpVal0, norm); \\\n\ + sub = tmpData1 - mean_vari.s0; \\\n\ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(conv_type, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_AXIS01_F16_F32to16Bits(F16,half4,vxc_half8,CONV)\n\ +LAYER_NORM_AXIS01_F16_F32to16Bits(I16,int4,vxc_short8,CONV_RTE)\n\ +\n\ +#define LAYER_NORM_AXIS01_F16_F16toQUANT(name,dst_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_F16_F16to##name( \\\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \\\n\ + image2d_array_t output, float eps) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int2 coord_sum = (int2)(0, gidz); \\\n\ + int4 coord_para = coord; \\\n\ + vxc_short8 src0; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h, in_h; \\\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_sum); \\\n\ + coord_sum.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + coord_para.z = 0; \\\n\ + coord_para.w = 0; \\\n\ + int4 coord_bias = coord_para; \\\n\ + \\\n\ + int8 input_desc, 
scale_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); \\\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; \\\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr); \\\n\ + \\\n\ + vxc_float4 tmpData0, tmpData1; \\\n\ + dst_type outval; \\\n\ + vxc_int4 tmpVal0, tmpVal1; \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.y ++; \\\n\ + coord_para.y = coord.y; \\\n\ + coord_bias.y = coord.y; \\\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x = coord.x; \\\n\ + \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + \\\n\ + vxc_float4 sub, norm; \\\n\ + sub = tmpData0 - mean_vari.s0; \\\n\ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + sub = tmpData1 - mean_vari.s0; \\\n\ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_AXIS01_F16_F16toQUANT(U8,vxc_uchar16)\n\ +LAYER_NORM_AXIS01_F16_F16toQUANT(I8,vxc_char16)\n\ +\n\ +#define LAYER_NORM_AXIS01_F16_F32toQUANT(name,dst_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_F16_F32to##name( \\\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \\\n\ + image2d_array_t output, float eps) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int2 coord_sum = (int2)(0, gidz); \\\n\ + int4 coord_para = coord; \\\n\ + vxc_short8 src0; \\\n\ + vxc_half8 in_h; \\\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_sum); \\\n\ + coord_sum.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + 
mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + coord_para.z = 0; \\\n\ + coord_para.w = 0; \\\n\ + int4 coord_bias = coord_para; \\\n\ + \\\n\ + int8 input_desc, scale_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr); \\\n\ + \\\n\ + vxc_float4 tmpData0, tmpData1; \\\n\ + dst_type outval; \\\n\ + vxc_int4 tmpVal0, tmpVal1; \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.y ++; \\\n\ + coord_bias.y = coord.y; \\\n\ + \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + scale_f0 = read_imagef(scale, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + scale_f1 = read_imagef(scale, coord_bias); \\\n\ + coord_bias.x = coord.x; \\\n\ + \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + \\\n\ + vxc_float4 sub, norm; \\\n\ + sub = tmpData0 - mean_vari.s0; \\\n\ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + sub = tmpData1 - mean_vari.s0; \\\n\ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_AXIS01_F16_F32toQUANT(U8,vxc_uchar16)\n\ +LAYER_NORM_AXIS01_F16_F32toQUANT(I8,vxc_char16)"; /* end of layer_normalization_axis01_2_vx*/ + +static const char layer_normalization_axis01_3_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform int height;\n\ +_viv_uniform uint group_num;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float inv_multiplier;\n\ +\n\ +#define LAYER_NORM_AXIS01_I16_F16to16Bits(name,temp_type,dst_type,conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I16_F16to##name( \\\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \\\n\ + image2d_array_t output, float eps) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int2 coord_sum = (int2)(0, gidz); \\\n\ + int4 coord_para = coord; \\\n\ + vxc_short8 src0, src1; \\\n\ + vxc_half8 scale_h; \\\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_sum); \\\n\ + coord_sum.x += 4; \\\n\ + } \\\n\ + 
mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + coord_para.z = 0; \\\n\ + coord_para.w = 0; \\\n\ + int4 coord_bias = coord_para; \\\n\ + \\\n\ + int8 input_desc, scale_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); \\\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; \\\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr); \\\n\ + \\\n\ + vxc_float4 tmpData0, tmpData1, norm; \\\n\ + temp_type tmpVal0, tmpVal1; \\\n\ + vxc_short8 outval; \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.y ++; \\\n\ + coord_para.y = coord.y; \\\n\ + coord_bias.y = coord.y; \\\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x = coord.x; \\\n\ + \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + tmpData0 = tmpData0 - mean_vari.s0; \\\n\ + tmpData1 = tmpData1 - mean_vari.s0; \\\n\ + \\\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(conv_type, tmpVal0, norm); \\\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(conv_type, tmpVal1, norm); \\\n\ + \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_AXIS01_I16_F16to16Bits(F16,half4,vxc_half8,CONV)\n\ +LAYER_NORM_AXIS01_I16_F16to16Bits(I16,int4,vxc_short8,CONV_RTE)\n\ +\n\ +\n\ +#define LAYER_NORM_AXIS01_I16_F32to16Bits(name,temp_type,dst_type,conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I16_F32to##name( \\\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \\\n\ + image2d_array_t output, float eps) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int2 coord_sum = (int2)(0, gidz); \\\n\ + int4 coord_para = coord; \\\n\ + vxc_short8 src0; \\\n\ + 
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_sum); \\\n\ + coord_sum.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + coord_para.z = 0; \\\n\ + coord_para.w = 0; \\\n\ + int4 coord_bias = coord_para; \\\n\ + \\\n\ + int8 input_desc, scale_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr); \\\n\ + \\\n\ + vxc_float4 tmpData0, tmpData1, norm; \\\n\ + temp_type tmpVal0, tmpVal1; \\\n\ + vxc_short8 outval; \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.y ++; \\\n\ + coord_bias.y = coord.y; \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + scale_f0 = read_imagef(scale, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + scale_f1 = read_imagef(scale, coord_bias); \\\n\ + coord_bias.x = coord.x; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + tmpData0 = tmpData0 - mean_vari.s0; \\\n\ + tmpData1 = tmpData1 - mean_vari.s0; \\\n\ + \\\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(conv_type, tmpVal0, norm); \\\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(conv_type, tmpVal1, norm); \\\n\ + \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_AXIS01_I16_F32to16Bits(F16,half4,vxc_half8,CONV)\n\ +LAYER_NORM_AXIS01_I16_F32to16Bits(I16,int4,vxc_short8,CONV_RTE)\n\ +"; /* end of layer_normalization_axis01_3_vx*/ + +static const char layer_normalization_axis01_sum_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniSumX_16x1;\n\ +_viv_uniform VXC_512Bits uniSumX2_16x1;\n\ +_viv_uniform VXC_512Bits uniSum_X_X2_8x2;\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +\n\ +\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_sums_F16toF32(\n\ + image2d_array_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + vxc_float4 sumsqr;\n\ + vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, 
sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSum_X_X2_8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0;\n\ + float sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_sums_I16toF32(\n\ + image2d_array_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + float4 tmpSumSqr = (float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSum_X_X2_8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + }\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + float4 data = (float4)(0);\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + data.x += dot(tmp_sum[i], one);\n\ + data.y += dot(tmp_sqr[i], one);\n\ + }\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_sums_U8toF32(\n\ + image2d_array_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, 0,\n\ + 
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1);\n\ + tmpSum += (tmpSum1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1);\n\ + tmpSqr += (tmpSqr1);\n\ + }\n\ + sqr += convert_float(tmpSqr);\n\ + sum = convert_float(tmpSum);\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_sums_I8toF32(\n\ + image2d_array_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_char16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1);\n\ + tmpSum += (tmpSum1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1);\n\ + tmpSqr += (tmpSqr1);\n\ + }\n\ + sqr += convert_float(tmpSqr);\n\ + sum = convert_float(tmpSum);\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +"; /* end of layer_normalization_axis01_sum_vx*/ + static const char log_softmax_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ _viv_uniform float rlogE;\n\ _viv_uniform int axisSize;\n\ @@ -20209,6 +22280,742 @@ __kernel void log_softmax_axis2_BF16toF32(\n\ }\n\ "; /* end of log_softmax_axis2_vx*/ +static const char log_softmax_exceed_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +_viv_uniform float rlogE;\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float betaValue;\n\ +_viv_uniform float scaleLogE;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_offset_asymmetric;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform int height;\n\ +_viv_uniform int inputWidth;\n\ +_viv_uniform int inputWidthRemain4;\n\ +_viv_uniform VXC_512Bits uniGetSubData0to3_4x4;\n\ 
+_viv_uniform VXC_512Bits uniGetSubData4to7_4x4;\n\ +_viv_uniform VXC_512Bits uniPackMaxData_2x8;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0(read_fun, vert_max_fun, horz_max_fun) \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, val, val0, 16); \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 16;coord.x < (axisSize + 16);coord.x+=32) \\\n\ + { \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + read_fun(val1, input, coord, VXC_5BITOFFSET_XY(-8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val1, val1, 16); \\\n\ + read_fun(val2, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val2, val2, 16); \\\n\ + read_fun(val3, input, coord, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val3, val3, 16); \\\n\ + vert_max_fun(val, img_val0, img_val1, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + vert_max_fun(val, img_val2, img_val3, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + horz_max_fun(val, val, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(val, val, val, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); \\\n\ + horz_max_fun(val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_float4 prob; \\\n\ + float fProbSum = 0; \\\n\ + const float4 one4 = (float4)(1.0, 1.0, 1.0, 1.0); \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 0;coord.x < inputWidth;coord.x+=4) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + VXC_DP4x4(prob, img_val0, val,\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \\\n\ + prob *= scaleLogE; \\\n\ + prob = exp2(prob); \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + } \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \\\n\ + prob *= scaleLogE; \\\n\ + if(inputWidthRemain4 == 1) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.yzw = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + else if(inputWidthRemain4 == 2) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.y = exp2(prob.y); \\\n\ + prob.zw = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + else if(inputWidthRemain4 == 3) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.y = exp2(prob.y); \\\n\ + prob.z = exp2(prob.z); \\\n\ + prob.w = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + vxc_float4 probSum_log; \\\n\ + probSum_log.x = log2(fProbSum) * rlogE;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_SAVE(dst_type, \\\n\ + save_type, conv_mode, OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 0; coord.x < axisSize; coord.x += 8) \\\n\ + { \\\n\ + dst_type vec0, vec1; \\\n\ + save_type dst; \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 
0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + prob = prob * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, vec0, prob); \\\n\ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData4to7_4x4); \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + prob = prob * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, vec1, prob); \\\n\ + VXC_DP2x8(dst, vec0, vec1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + }\n\ +\n\ +#define LOGSOFTMAX_EXCEED_AXIS0(src_name, dst_name, src_type, copy_type, dst_type,\\\n\ + save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun, horz_max_fun) \\\n\ +__kernel void log_softmax_exceed_axis0_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(16, 0, get_global_id(1), 0); \\\n\ + src_type img_val0, img_val1, img_val2, img_val3; \\\n\ + copy_type val0, val1, val2, val3; \\\n\ + src_type val; \\\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0(VXC_ReadImage2DArray, vert_max_fun, horz_max_fun) \\\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \\\n\ +}\n\ +\n\ +LOGSOFTMAX_EXCEED_AXIS0(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS0(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS0(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS0(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ + CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS0(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS0(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS0(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS0(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8,\\\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS0(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_TOF32_SAVE(read_fun) \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 0; coord.x < axisSize; ) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + 
write_imagef(output, coord, prob); \\\n\ + coord.x += 4; \\\n\ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData4to7_4x4); \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + write_imagef(output, coord, prob); \\\n\ + coord.x += 4; \\\n\ + } \\\n\ + }\n\ +\n\ +#define LOGSOFTMAX_EXCEED_AXIS0_TOF32(src_name, src_type, copy_type, vert_max_fun, horz_max_fun) \\\n\ +__kernel void log_softmax_exceed_axis0_##src_name##toF32 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(16, 0, get_global_id(1), 0); \\\n\ + src_type img_val0, img_val1, img_val2, img_val3; \\\n\ + copy_type val0, val1, val2, val3; \\\n\ + src_type val; \\\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0(VXC_ReadImage2DArray, vert_max_fun, horz_max_fun) \\\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_TOF32_SAVE(VXC_ReadImage2DArray) \\\n\ +}\n\ +\n\ +LOGSOFTMAX_EXCEED_AXIS0_TOF32(F16, vxc_half8, vxc_short8, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS0_TOF32(I16, vxc_short8, vxc_short8, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS0_TOF32(I8, vxc_char16, vxc_char16, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS0_TOF32(U8, vxc_uchar16, vxc_uchar16, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +\n\ +"; /* end of log_softmax_exceed_axis0_vx*/ + +static const char log_softmax_exceed_axis0_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +_viv_uniform float rlogE;\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float betaValue;\n\ +_viv_uniform float scaleLogE;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +\n\ +_viv_uniform int height;\n\ +_viv_uniform int inputWidth;\n\ +_viv_uniform int inputWidthRemain4;\n\ +_viv_uniform VXC_512Bits uniPackMaxData_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16(read_fun) \\\n\ + vxc_half8 img_val0, img_val1, img_val2, img_val3; \\\n\ + vxc_short8 val0, val1, val2, val3; \\\n\ + vxc_half8 val; \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, val, val0, 16); \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 16; coord.x < (axisSize + 16);) \\\n\ + { \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + read_fun(val1, input, coord, VXC_5BITOFFSET_XY(-8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val1, val1, 16); \\\n\ + read_fun(val2, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val2, val2, 16); \\\n\ + read_fun(val3, input, coord, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val3, val3, 16); \\\n\ + coord.x += 32; \\\n\ + VXC_VertMax3_Half(val, img_val0, img_val1, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_VertMax3_Half(val, img_val2, img_val3, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + VXC_HorzMax3_Half(val, val, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(val, val, val, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); \\\n\ + VXC_HorzMax3_Half(val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + 
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + vxc_ushort8 bf_val_tmp; \\\n\ + vxc_float4 vecA; \\\n\ + _viv_asm(COPY, bf_val_tmp, val, 16); \\\n\ + VXC_DP2x8(bf_val_tmp, bf_val_tmp, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecA, bf_val_tmp, 16); \\\n\ + vxc_float4 prob; \\\n\ + float fProbSum = 0; \\\n\ + const float4 one4 = (float4)(1.0, 1.0, 1.0, 1.0); \\\n\ + float max_value = vecA.x * scaleLogE; \\\n\ + float max_value_orig = vecA.x; \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 0; coord.x < inputWidth; ) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(bf_val_tmp, val0, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, prob, bf_val_tmp, 16); \\\n\ + prob = prob * scaleLogE - max_value; \\\n\ + prob = exp2(prob); \\\n\ + fProbSum += dot(prob, one4); \\\n\ + coord.x += 4; \\\n\ + } \\\n\ + } \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(bf_val_tmp, val0, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, prob, bf_val_tmp, 16); \\\n\ + prob = prob * scaleLogE - max_value; \\\n\ + if(inputWidthRemain4 == 1) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.yzw = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + else if(inputWidthRemain4 == 2) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.y = exp2(prob.y); \\\n\ + prob.zw = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + else if(inputWidthRemain4 == 3) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.y = exp2(prob.y); \\\n\ + prob.z = exp2(prob.z); \\\n\ + prob.w = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + vxc_float4 probSum_log; \\\n\ + probSum_log.x = log2(fProbSum) * rlogE;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOBF16_SAVE(read_fun, write_fun) \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 0; coord.x < axisSize; ) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(bf_val_tmp, val0, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, prob, bf_val_tmp, 16); \\\n\ + prob = prob - max_value_orig; \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + vxc_ushort8 tmp, dst; \\\n\ + _viv_asm(COPY, tmp, prob, 16); \\\n\ + dst.s0123 = tmp.s1357; \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 4; \\\n\ + } \\\n\ + }\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOF16_SAVE(read_fun, write_fun) \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 0; coord.x < axisSize; ) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(bf_val_tmp, val0, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, prob, bf_val_tmp, 16); \\\n\ + prob = prob - max_value_orig; \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + half4 vec; \\\n\ + vxc_half4 tmp; \\\n\ + vxc_short4 dst; \\\n\ + _viv_asm(CONV, vec, prob); \\\n\ + VXC_DP4x4(tmp, vec, vec, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, dst, tmp, 8); \\\n\ + 
write_fun(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 4; \\\n\ + } \\\n\ + }\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOF32_SAVE(read_fun) \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 0; coord.x < axisSize; ) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(bf_val_tmp, val0, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, prob, bf_val_tmp, 16); \\\n\ + prob = prob - max_value_orig; \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + write_imagef(output, coord, prob); \\\n\ + coord.x += 4; \\\n\ + } \\\n\ + }\n\ +\n\ +__kernel void log_softmax_exceed_axis0_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(16, 0, get_global_id(1), 0);\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16(VXC_ReadImage2DArray)\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOBF16_SAVE(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +__kernel void log_softmax_exceed_axis0_BF16toF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(16, 0, get_global_id(1), 0);\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16(VXC_ReadImage2DArray)\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOF16_SAVE(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +__kernel void log_softmax_exceed_axis0_BF16toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(16, 0, get_global_id(1), 0);\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16(VXC_ReadImage2DArray)\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOF32_SAVE(VXC_ReadImage2DArray)\n\ +}\n\ +\n\ +"; /* end of log_softmax_exceed_axis0_BF16_vx*/ + +static const char log_softmax_exceed_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float rlogE;\n\ +_viv_uniform int depth;\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float betaValue;\n\ +_viv_uniform float scaleLogE;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_offset_asymmetric;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniGetSubLoData_4x4;\n\ +_viv_uniform VXC_512Bits uniGetSubHiData_4x4;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS1(read_fun, vert_max_fun) \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, max, in0, 16); \\\n\ + for (coord.z = 0; coord.z < depth; coord.z ++) \\\n\ + { \\\n\ + for (coord.y = 0; coord.y < axisSize;) \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + vert_max_fun(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + } \\\n\ + } \\\n\ + coord.y = 0; \\\n\ + sum0 = 0; \\\n\ + sum1 = 0; \\\n\ + for (coord.z = 0; coord.z < depth; coord.z ++) \\\n\ + { \\\n\ + for (coord.y = 0; coord.y < axisSize;coord.y++) \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \\\n\ + VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \\\n\ + data0 *= scaleLogE; \\\n\ + data0 = exp2(data0); \\\n\ + 
sum0 += data0; \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \\\n\ + VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \\\n\ + data0 *= scaleLogE; \\\n\ + data0 = exp2(data0); \\\n\ + sum1 += data0; \\\n\ + } \\\n\ + } \\\n\ + sum0 = log2(sum0) * rlogE; \\\n\ + sum1 = log2(sum1) * rlogE;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS1_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \\\n\ + coord.y = 0; \\\n\ + dst_type dst0, dst1; \\\n\ + save_type vect; \\\n\ + for (coord.z = 0; coord.z < depth; coord.z ++) \\\n\ + { \\\n\ + for (coord.y = 0; coord.y < axisSize;) \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \\\n\ + VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \\\n\ + data0 = data0 * betaValue - sum0; \\\n\ + data0 = data0 * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst0, data0); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \\\n\ + VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \\\n\ + data0 = data0 * betaValue - sum1; \\\n\ + data0 = data0 * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst1, data0); \\\n\ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, \\\n\ + VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \\\n\ + write_fun(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + } \\\n\ + } \\\n\ +\n\ +#define LOGSOFTMAX_EXCEED_AXIS1(src_name, dst_name, src_type, copy_type, dst_type,\\\n\ +save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun) \\\n\ +__kernel void log_softmax_exceed_axis1_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, 0, 0); \\\n\ + src_type vec0, max; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + vxc_float4 sum0, sum1; \\\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1(VXC_ReadImage2DArray, vert_max_fun) \\\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \\\n\ +}\n\ +\n\ +\n\ +\n\ +LOGSOFTMAX_EXCEED_AXIS1(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS1(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS1(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS1(F16, U8, vxc_half8, vxc_short8, uchar4,\\\n\ +vxc_uchar8, CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half)\n\ +\n\ +LOGSOFTMAX_EXCEED_AXIS1(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer)\n\ +\n\ +LOGSOFTMAX_EXCEED_AXIS1(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer)\n\ +\n\ +LOGSOFTMAX_EXCEED_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16, uchar4,\\\n\ +vxc_uchar8, CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer)\n\ 
+LOGSOFTMAX_EXCEED_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, \\\n\ +vxc_short8, CONV, 1, 0, VXC_VertMax3_Integer)\n\ +\n\ +\n\ +\n\ +#define LOGSOFTMAX_EXCEED_AXIS1_TOF32(src_name, src_type, copy_type, vert_max_fun) \\\n\ +__kernel void log_softmax_exceed_axis1_##src_name##toF32 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, 0, 0); \\\n\ + src_type vec0, max; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + vxc_float4 sum0, sum1; \\\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1(VXC_ReadImage2DArray, vert_max_fun) \\\n\ + coord.y = 0; \\\n\ + for (coord.z = 0; coord.z < depth; coord.z ++) \\\n\ + { \\\n\ + for (coord.y = 0; coord.y < axisSize;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \\\n\ + VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \\\n\ + data0 = data0 * betaValue - sum0; \\\n\ + write_imagef(output, coord, data0); \\\n\ + coord.x += 4; \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \\\n\ + VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \\\n\ + data0 = data0 * betaValue - sum1; \\\n\ + write_imagef(output, coord, data0); \\\n\ + coord.x -= 4; \\\n\ + coord.y++; \\\n\ + } \\\n\ + } \\\n\ +}\n\ +\n\ +LOGSOFTMAX_EXCEED_AXIS1_TOF32(F16, vxc_half8, \\\n\ +vxc_short8, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS1_TOF32(I16, vxc_short8, \\\n\ +vxc_short8, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS1_TOF32(I8, vxc_char16, \\\n\ +vxc_char16, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS1_TOF32(U8, vxc_uchar16, \\\n\ +vxc_uchar16, VXC_VertMax3_Integer)"; /* end of log_softmax_exceed_axis1_vx*/ + +static const char log_softmax_exceed_axis1_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float rlogE;\n\ +_viv_uniform int depth;\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float betaValue;\n\ +_viv_uniform float scaleLogE;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS1_BF16(read_fun) \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, max, in0, 16); \\\n\ + for (coord.z = 0; coord.z < depth; coord.z ++) \\\n\ + { \\\n\ + for (coord.y = 0; coord.y < axisSize;) \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_VertMax3_Half(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + } \\\n\ + } \\\n\ + _viv_asm(COPY, tmp0, max, 16); \\\n\ + VXC_DP2x8(tmp1, tmp0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, max_lo, tmp1, 16); \\\n\ + VXC_DP2x8(tmp1, tmp0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, max_hi, tmp1, 16); \\\n\ + coord.y = 0; \\\n\ + sum0 = 0; \\\n\ + sum1 = 0; \\\n\ + for (coord.z = 0; coord.z < depth; coord.z ++) \\\n\ + { \\\n\ + for (coord.y = 0; coord.y < axisSize;) \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(tmp1, in0, zero, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data0, tmp1, 16); \\\n\ + data0 = data0 - max_lo; \\\n\ + data0 *= scaleLogE; \\\n\ + sum0 += exp2(data0); \\\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, data0, tmp1, 16); \\\n\ + data0 = data0 - max_hi; \\\n\ + data0 *= scaleLogE; \\\n\ + sum1 += exp2(data0); \\\n\ + coord.y++; \\\n\ + } \\\n\ + } \\\n\ + sum0 = log2(sum0) * rlogE; \\\n\ + sum1 = log2(sum1) * rlogE;\n\ +\n\ +__kernel void log_softmax_exceed_axis1_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, 0, 0);\n\ + vxc_short8 in0;\n\ + vxc_half8 vec0, max;\n\ + vxc_float4 data0;\n\ + vxc_float4 sum0, sum1;\n\ + vxc_float4 max_lo, max_hi;\n\ + vxc_ushort8 tmp0, tmp1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1_BF16(VXC_ReadImage2DArray)\n\ +\n\ + coord.y = 0;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + for (coord.z = 0; coord.z < depth; coord.z ++)\n\ + {\n\ + for (coord.y = 0; coord.y < axisSize;)\n\ + {\n\ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_lo;\n\ + data0 = data0 * betaValue - sum0;\n\ + _viv_asm(COPY, dst0, data0, 16);\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_hi;\n\ + data0 = data0 * betaValue - sum1;\n\ + _viv_asm(COPY, dst1, data0, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_exceed_axis1_BF16toF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, 0, 0);\n\ + vxc_short8 in0;\n\ + vxc_half8 vec0, max;\n\ + vxc_float4 data0;\n\ + vxc_float4 sum0, sum1;\n\ + vxc_float4 max_lo, max_hi;\n\ + vxc_ushort8 tmp0, tmp1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1_BF16(VXC_ReadImage2DArray)\n\ +\n\ + coord.y = 0;\n\ + half4 dst0, dst1;\n\ + for (coord.z = 0; coord.z < depth; coord.z ++)\n\ + {\n\ + for (coord.y = 0; coord.y < axisSize;)\n\ + {\n\ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_lo;\n\ + data0 = data0 * betaValue - sum0;\n\ + _viv_asm(CONV, dst0, data0);\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_hi;\n\ + data0 = data0 * betaValue - sum1;\n\ + _viv_asm(CONV, dst1, data0);\n\ + VXC_DP2x8(vec0, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);\n\ + vxc_short8 vect;\n\ + _viv_asm(COPY, vect, vec0, 16);\n\ + 
VXC_WriteImage2DArray(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_exceed_axis1_BF16toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, 0, 0);\n\ + vxc_short8 in0;\n\ + vxc_half8 vec0, max;\n\ + vxc_float4 data0;\n\ + vxc_float4 sum0, sum1;\n\ + vxc_float4 max_lo, max_hi;\n\ + vxc_ushort8 tmp0, tmp1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1_BF16(VXC_ReadImage2DArray)\n\ +\n\ + coord.y = 0;\n\ + for (coord.z = 0; coord.z < depth; coord.z ++)\n\ + {\n\ + for (coord.y = 0; coord.y < axisSize;)\n\ + {\n\ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_lo;\n\ + data0 = data0 * betaValue - sum0;\n\ + write_imagef(output, coord, data0);\n\ + coord.x += 4;\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_hi;\n\ + data0 = data0 * betaValue - sum1;\n\ + write_imagef(output, coord, data0);\n\ + coord.x -= 4;\n\ + coord.y++;\n\ + }\n\ + }\n\ +}\n\ +"; /* end of log_softmax_exceed_axis1_BF16_vx*/ + static const char logical_not_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void logical_not_I8toI8(\n\ @@ -36020,6 +38827,368 @@ NV12_COPY_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ NV12_COPY_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ "; /* end of pre_process_nv12_copy_vx*/ +static const char pre_process_nv12_rggb_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +\n\ +_viv_uniform float outputScaleVar_b;\n\ +_viv_uniform float outputScaleVar_g;\n\ +_viv_uniform float outputScaleVar_r;\n\ +\n\ +_viv_uniform float bMeanScaleVarZp;\n\ +_viv_uniform float gMeanScaleVarZp;\n\ +_viv_uniform float rMeanScaleVarZp;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractYtoShortSub16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4;\n\ +\n\ +#define NV12_RGGB_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ +__kernel void pre_process_nv12_rggb_copy_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t uv_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int* xRatio, \\\n\ + global int* yRatio, \\\n\ + global int* xOffset, \\\n\ + global int* yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse_channel, \\\n\ + int trans, \\\n\ + int nv_type, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int sy = gidy + (*yOffset); \\\n\ + int sx = gidx + (*xOffset); \\\n\ + int uvX = sx & 0xfffffffe; \\\n\ + int uvY = sy >> 1; \\\n\ + \\\n\ + vxc_uchar16 Y, UV; \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, (int2)(sx,sy), 0, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), 0,VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + if (nv_type == 3) \\\n\ + { \\\n\ + UV.s0123 = UV.s1032; \\\n\ + } \\\n\ + \\\n\ + vxc_short8 tmpY; \\\n\ + vxc_char16 tmpUV; \\\n\ + short tmpVal = 16; \\\n\ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractYtoShortSub16_2x8); \\\n\ + tmpVal = 128; \\\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \\\n\ + \\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \\\n\ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \\\n\ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + \\\n\ + conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + dstPos.z = 2; \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +NV12_RGGB_COPY_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\ +NV12_RGGB_COPY_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)\n\ +NV12_RGGB_COPY_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ +NV12_RGGB_COPY_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ +"; /* end of pre_process_nv12_rggb_copy_vx*/ + +static const char pre_process_nv12_rggb_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +\n\ +_viv_uniform float outputScaleVar_b;\n\ +_viv_uniform float outputScaleVar_g;\n\ +_viv_uniform float outputScaleVar_r;\n\ +\n\ +_viv_uniform float bMeanScaleVarZp;\n\ +_viv_uniform float gMeanScaleVarZp;\n\ +_viv_uniform 
float rMeanScaleVarZp;\n\ +\n\ +_viv_uniform uint xrIntFloat_16;\n\ +_viv_uniform uint yrIntFloat_16;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertYtoShortSub16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateYShift_2x8;\n\ +_viv_uniform VXC_512Bits uniCalculateUVShift_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4;\n\ +\n\ +#define NV12_RGGB_OPT_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ +__kernel void pre_process_nv12_rggb_scale_##name##_gq \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t uv_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int* xRatio, \\\n\ + global int* yRatio, \\\n\ + global int* xOffset, \\\n\ + global int* yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse_channel, \\\n\ + int trans, \\\n\ + int nv_type, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + uint4 gidx = get_global_id(0); \\\n\ + uint gidy = get_global_id(1); \\\n\ + gidx += (uint4)(0, 1, 2, 3); \\\n\ + \\\n\ + uint dy = (gidy * yrIntFloat_16) >> 16; \\\n\ + uint4 dx = (gidx * xrIntFloat_16) >> 16; \\\n\ + int sy = convert_int(dy) + (*yOffset); \\\n\ + int4 sx = convert_int4(dx) + (*xOffset); \\\n\ + int4 uvX = sx & 0xfffffffe; \\\n\ + int uvY = sy >> 1; \\\n\ + \\\n\ + vxc_uchar16 Y, UV; \\\n\ + int2 coord = (int2)(sx.x, sy); \\\n\ + int2 coord_uv = (int2)(uvX.x, uvY); \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + if (nv_type == 3) \\\n\ + { \\\n\ + UV.s0123456789abcdef = UV.s1032547698badcfe; \\\n\ + } \\\n\ + \\\n\ + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \\\n\ + vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \\\n\ + int4 offsetUV = uvX - uvX.x; \\\n\ + \\\n\ + vxc_ushort8 diffY, diffUV; \\\n\ + _viv_asm(COPY, diffY, sx, 16); \\\n\ + _viv_asm(COPY, diffUV, offsetUV, 16); \\\n\ + \\\n\ + vxc_ushort8 constData = 8; \\\n\ + VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniCalculateYShift_2x8); \\\n\ + VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniCalculateUVShift_2x8); \\\n\ + VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_short8 tmpY; \\\n\ + vxc_char16 tmpUV; \\\n\ + short tmpVal = 16; \\\n\ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertYtoShortSub16_2x8); \\\n\ + tmpVal = 128; \\\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \\\n\ + \\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \\\n\ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), 
uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \\\n\ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + \\\n\ + conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + dstPos.z = 2; \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +NV12_RGGB_OPT_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\ +NV12_RGGB_OPT_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)\n\ +NV12_RGGB_OPT_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ +NV12_RGGB_OPT_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ +\n\ +#define NV12_RGGB_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ +__kernel void pre_process_nv12_rggb_scale_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t uv_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int* xRatio, \\\n\ + global int* yRatio, \\\n\ + global int* xOffset, \\\n\ + global int* yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse_channel, \\\n\ + int trans, \\\n\ + int nv_type, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + uint4 gidx = get_global_id(0); \\\n\ + uint gidy = get_global_id(1); \\\n\ + gidx += (uint4)(0, 1, 2, 3); \\\n\ + \\\n\ + uint dy = (gidy * yrIntFloat_16) >> 16; \\\n\ + uint4 dx = (gidx * xrIntFloat_16) >> 16; \\\n\ + int sy = convert_int(dy) + (*yOffset); \\\n\ + int4 sx = convert_int4(dx) + (*xOffset); \\\n\ + int4 uvX = sx & 0xfffffffe; \\\n\ + int uvY = sy >> 1; \\\n\ + \\\n\ + vxc_uchar16 Y, UV; \\\n\ + int2 coord = (int2)(sx.x, sy); \\\n\ + int2 coord_uv = (int2)(uvX.x, uvY); \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = sx.y; \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(1, 1, 0, 
VXC_RM_TowardZero, 0)); \\\n\ + coord.x = sx.z; \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = sx.w; \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.y; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.z; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.w; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + if (nv_type == 3) \\\n\ + { \\\n\ + UV.s01234567 = UV.s10325476; \\\n\ + } \\\n\ + \\\n\ + vxc_short8 tmpY; \\\n\ + vxc_char16 tmpUV; \\\n\ + short tmpVal = 16; \\\n\ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertYtoShortSub16_2x8); \\\n\ + tmpVal = 128; \\\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \\\n\ + \\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \\\n\ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \\\n\ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + \\\n\ + conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + dstPos.z = 2; \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +NV12_RGGB_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\ +NV12_RGGB_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)\n\ 
+NV12_RGGB_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ +NV12_RGGB_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ +"; /* end of pre_process_nv12_rggb_scale_vx*/ + static const char pre_process_nv12_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int bOrder;\n\ @@ -46004,80 +49173,79 @@ __kernel void resize_bilinear_F16toF16_DOWN\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void resize_bilinear_F16toU8_DOWN\n\ - (\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - int align_corners,\n\ - int half_pixel_centers\n\ - )\n\ -{\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ - float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ - float4 left_x_f = floor(in_x);\n\ - float4 x_lerp = in_x - left_x_f;\n\ - int4 left_x_idx = convert_int4(left_x_f);\n\ - float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ - float top_y_f = floor(in_y);\n\ - float y_lerp = in_y - top_y_f;\n\ - int top_y_idx = convert_int(top_y_f);\n\ -\n\ - vxc_short8 top_short, bottom_short;\n\ - vxc_half8 top, bottom;\n\ - int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.y;\n\ - VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.z;\n\ - VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.w;\n\ - VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, top, top_short, 16);\n\ - _viv_asm(COPY, bottom, bottom_short, 16);\n\ -\n\ - float4 left4;\n\ - float4 right4;\n\ - float4 top4;\n\ - float4 bottom4;\n\ - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ - top4 = right4 * x_lerp + left4;\n\ - VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ - VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ - bottom4 = right4 * x_lerp + left4;\n\ - bottom4 -= top4;\n\ - float4 dst4 = bottom4 * y_lerp + top4;\n\ - dst4 = dst4 * uint8Scale + output_ZP;\n\ - int4 dst = convert_int4_rte(dst4);\n\ - 
vxc_uchar8 dst_uchar;\n\ - VXC_DP2x8(dst_uchar, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ -\n\ - int8 output_desc;\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.w, baseAddr);\n\ -\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_uchar,\n\ - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +#define RESIZE_BILINEAR_F16TOQINT_DOWN(out_name, dst_type) \\\n\ +__kernel void resize_bilinear_F16to##out_name##_DOWN( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int align_corners, \\\n\ + int half_pixel_centers \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); \\\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; \\\n\ + float4 left_x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - left_x_f; \\\n\ + int4 left_x_idx = convert_int4(left_x_f); \\\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; \\\n\ + float top_y_f = floor(in_y); \\\n\ + float y_lerp = in_y - top_y_f; \\\n\ + int top_y_idx = convert_int(top_y_f); \\\n\ + \\\n\ + vxc_short8 top_short, bottom_short; \\\n\ + vxc_half8 top, bottom; \\\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); \\\n\ + \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = left_x_idx.y; \\\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = left_x_idx.z; \\\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = left_x_idx.w; \\\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, top, top_short, 16); \\\n\ + _viv_asm(COPY, bottom, bottom_short, 16); \\\n\ + \\\n\ + float4 left4, right4, top4, bottom4; \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniRightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + dst_type dst_uchar; \\\n\ + VXC_DP2x8(dst_uchar, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ +RESIZE_BILINEAR_F16TOQINT_DOWN(U8, vxc_uchar8)\n\ +RESIZE_BILINEAR_F16TOQINT_DOWN(U16, vxc_ushort8)\n\ \n\ __kernel void resize_bilinear_F16toF16_UP\n\ (\n\ @@ -46198,14 +49366,16 @@ __kernel void resize_bilinear_F16toF16_UP\n\ static const char resize_bilinear_I16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;\n\ _viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ _viv_uniform float2 scale_xy;\n\ _viv_uniform int depth;\n\ +_viv_uniform int input_ZP;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ _viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ _viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ -_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;\n\ -_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ -_viv_uniform float dfpScale;\n\ _viv_uniform float half_pixel_value;\n\ \n\ __kernel void resize_bilinear_I16toI16_UP\n\ @@ -46253,24 +49423,24 @@ __kernel void resize_bilinear_I16toI16_UP\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + short inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ \n\ vxc_ushort8 bitextract_p0;\n\ vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};\n\ VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ vxc_ushort8 constData = 16;\n\ VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ \n\ int8 output_desc;\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ - float4 left4;\n\ - float4 right4;\n\ - float4 top4;\n\ - float4 bottom4;\n\ + float4 left4, right4, top4, bottom4;\n\ \n\ int loop = depth - 1;\n\ while (coord_in.z < loop)\n\ @@ -46289,18 +49459,18 @@ __kernel void resize_bilinear_I16toI16_UP\n\ VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * 
x_lerp + left4;\n\ \n\ - VXC_DP4x4(left4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ - dst4 = dst4 * dfpScale;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ @@ -46313,17 +49483,17 @@ __kernel void resize_bilinear_I16toI16_UP\n\ VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, top, dst0, 16);\n\ _viv_asm(COPY, bottom, dst1, 16);\n\ - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ - VXC_DP4x4(left4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ - dst4 = dst4 * dfpScale;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ @@ -46378,25 +49548,25 @@ __kernel void resize_bilinear_I16toI16_DOWN\n\ VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ - float4 left4;\n\ - float4 right4;\n\ - float4 top4;\n\ - float4 bottom4;\n\ + short inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ \n\ - VXC_DP4x4(left4, top, top, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ - VXC_DP4x4(right4, top, top, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + float4 left4, right4, top4, bottom4;\n\ +\n\ + VXC_DP4x4(left4, top, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ \n\ - VXC_DP4x4(left4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ - VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, inputZP, \\\n\ + 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ - dst4 = dst4 * dfpScale;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ @@ -46407,21 +49577,23 @@ __kernel void resize_bilinear_I16toI16_DOWN\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ "; /* end of resize_bilinear_I16_vx*/ static const char resize_bilinear_I8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;\n\ _viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ _viv_uniform float2 scale_xy;\n\ _viv_uniform int depth;\n\ +_viv_uniform int input_ZP;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ _viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ _viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ -_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;\n\ -_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ -_viv_uniform float dfpScale;\n\ _viv_uniform float half_pixel_value;\n\ \n\ __kernel void resize_bilinear_I8toI8_UP\n\ @@ -46465,14 +49637,17 @@ __kernel void resize_bilinear_I8toI8_UP\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + short inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ \n\ vxc_ushort8 bitextract_p0;\n\ vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ vxc_ushort8 constData = 8;\n\ VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ \n\ int8 output_desc;\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ @@ -46498,22 +49673,22 @@ __kernel void resize_bilinear_I8toI8_UP\n\ VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ - VXC_DP4x4(left4, top, top, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(left4, top, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, top, top, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ \n\ top4 = right4 * x_lerp + left4;\n\ \n\ - VXC_DP4x4(left4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ \n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ - 
dst4 = dst4 * dfpScale;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ \n\ @@ -46525,19 +49700,19 @@ __kernel void resize_bilinear_I8toI8_UP\n\ VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, top, dst0, 16);\n\ _viv_asm(COPY, bottom, dst1, 16);\n\ - VXC_DP4x4(left4, top, top, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(left4, top, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, top, top, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ - VXC_DP4x4(left4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ - dst4 = dst4 * dfpScale;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ @@ -46587,26 +49762,29 @@ __kernel void resize_bilinear_I8toI8_DOWN\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + short inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ \n\ float4 left4;\n\ float4 right4;\n\ float4 top4;\n\ float4 bottom4;\n\ \n\ - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ \n\ - VXC_DP4x4(left4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ - VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ \n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ \n\ - dst4 = dst4 * dfpScale;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ \n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ @@ -46620,6 +49798,286 @@ __kernel void resize_bilinear_I8toI8_DOWN\n\ }\n\ "; /* end of resize_bilinear_I8_vx*/ +static const char resize_bilinear_U16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits 
uniU8RightSubLeft_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform float2 scale_xy;\n\ +_viv_uniform int depth;\n\ +_viv_uniform int input_ZP;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ +_viv_uniform float half_pixel_value;\n\ +\n\ +__kernel void resize_bilinear_U16toF16_DOWN\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + vxc_ushort8 top, bottom;\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4, right4, top4, bottom4;\n\ +\n\ + short inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ +\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ +\n\ + dst4 *= uint8Scale;\n\ +\n\ + half4 dst;\n\ + _viv_asm(CONV, dst, dst4);\n\ +\n\ + vxc_short8 dst_short;\n\ + _viv_asm(COPY, dst_short, dst, 16);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + 
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_short.s0246,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_bilinear_U16toU16_UP\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ +\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ +\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ +\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ +\n\ + vxc_ushort8 src0, src1, src2, src3;\n\ +\n\ + vxc_ushort8 top;\n\ + vxc_ushort8 bottom;\n\ +\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + short inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ +\n\ + vxc_ushort8 bitextract_p0;\n\ + vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};\n\ + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + vxc_ushort8 constData = 16;\n\ + VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 left4, right4, top4, bottom4;\n\ +\n\ + int loop = depth - 1;\n\ + while (coord_in.z < loop)\n\ + {\n\ + VXC_BitExtract(top, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(bottom, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.zw += (int2)(1, input_desc.s4);\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.zw += (int2)(1, output_desc.s4);\n\ + }\n\ +\n\ + VXC_BitExtract(top, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(bottom, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_bilinear_U16toU16_DOWN\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + vxc_ushort8 top, bottom, result;\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, 
VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4, right4, top4, bottom4;\n\ +\n\ + short inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ +\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ +\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ + int4 dst = convert_int4_rte(dst4);\n\ +\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of resize_bilinear_U16_vx*/ + static const char resize_bilinear_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;\n\ @@ -46754,6 +50212,9 @@ __kernel void resize_bilinear_U8toU8_UP\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + short inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ \n\ vxc_ushort8 bitextract_p0;\n\ vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ @@ -46785,8 +50246,6 @@ __kernel void resize_bilinear_U8toU8_UP\n\ VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ - unsigned char inputZP;\n\ - _viv_asm(COPY, inputZP, input_ZP, 4);\n\ VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ @@ -46807,8 +50266,7 @@ __kernel void resize_bilinear_U8toU8_UP\n\ \n\ VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - unsigned char inputZP;\n\ - _viv_asm(COPY, inputZP, input_ZP, 4);\n\ +\n\ VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, top, top, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ @@ -47825,6 +51283,277 @@ __kernel void resize_bilinear_nhwc_bound_U8toU8_4x\n\ }\n\ "; /* end of resize_bilinear_nhwc_bound_vx*/ +static const char resize_cubic_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_tail;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_tail;\n\ +_viv_uniform VXC_512Bits uniFp16ToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ +_viv_uniform VXC_512Bits uniExtract8Bit_2x8;\n\ +\n\ +#define RESIZE_CUBIC_PART0 \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_index = coord_out; \\\n\ + int2 coord_scalew = (int2)(4 * get_global_id(0), 0); \\\n\ + int2 coord_scaleh = (int2)(4 * get_global_id(1), 0); \\\n\ + float4 cubic_coeffs_y; \\\n\ + float4 cubic_coeffs_x; \\\n\ + int4 coord_in = (int4)(0, 0, coord_out.z, 0); \\\n\ + float4 src0_f,src1_f,src2_f,src3_f; \\\n\ + float4 dst = (float4)(0,0,0,0); \\\n\ + float sum[4]; \\\n\ + int i = 0; \\\n\ + \\\n\ + Image scalew = create_image_from_image2d(scale_w, 4); \\\n\ + Image scaleh = create_image_from_image2d(scale_h, 4); \\\n\ + \\\n\ + uchar* scale_w_ptr = get_image_ptr_from_coord(scalew, coord_scalew); \\\n\ + __global float* scale_x = (__global float*)scale_w_ptr; \\\n\ + \\\n\ + uchar* scale_h_ptr = get_image_ptr_from_coord(scaleh, coord_scaleh); \\\n\ + __global float* scale_y = (__global float*)scale_h_ptr; \\\n\ + cubic_coeffs_y = vload4(0, scale_y); \\\n\ + \\\n\ + int index_y = read_imagei(index_h, coord_index.yw).x; \\\n\ + coord_in.y = index_y; \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ +#define RESIZE_CUBIC_16Bitsto16Bits(name,src_type,dst_type,temp_type) \\\n\ +__kernel void resize_cubic_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + __read_only image2d_t scale_w, \\\n\ + __read_only image2d_t scale_h, \\\n\ + __read_only image2d_t index_w, \\\n\ + __read_only image2d_t index_h \\\n\ + ) \\\n\ +{ \\\n\ + RESIZE_CUBIC_PART0; \\\n\ + src_type src0_h,src1_h,src2_h,src3_h; \\\n\ + vxc_short4 src0,src1,src2,src3; \\\n\ + for (i = 0; i < 4; i++) \\\n\ + { \\\n\ + coord_in.x = read_imagei(index_w, coord_index.xw).x; \\\n\ + cubic_coeffs_x = vload4(i, scale_x); \\\n\ + coord_index.x = coord_index.x + 1; \\\n\ + \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + _viv_asm(COPY, src0_h, src0, 8); \\\n\ + _viv_asm(COPY, src1_h, src1, 8); \\\n\ + _viv_asm(COPY, src2_h, src2, 8); \\\n\ + 
_viv_asm(COPY, src3_h, src3, 8); \\\n\ + \\\n\ + VXC_DP4x4(src0_f, src0_h, src0_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src1_f, src1_h, src1_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src2_f, src2_h, src2_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src3_f, src3_h, src3_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + \\\n\ + dst = src0_f * cubic_coeffs_y.x \\\n\ + + src1_f * cubic_coeffs_y.y \\\n\ + + src2_f * cubic_coeffs_y.z \\\n\ + + src3_f * cubic_coeffs_y.w; \\\n\ + sum[i] = dot(dst, cubic_coeffs_x); \\\n\ + } \\\n\ + float4 sum_f = (float4)(sum[0],sum[1],sum[2],sum[3]); \\\n\ + temp_type tmpout; \\\n\ + _viv_asm(CONV,tmpout,sum_f); \\\n\ + dst_type out_h; \\\n\ + vxc_short4 out; \\\n\ + VXC_DP2x8(out_h, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \\\n\ + _viv_asm(COPY, out, out_h, 8); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +RESIZE_CUBIC_16Bitsto16Bits(F16toF16,vxc_half4, vxc_half4, half4)\n\ +RESIZE_CUBIC_16Bitsto16Bits(I16toI16,vxc_short4,vxc_short4,short4)\n\ +RESIZE_CUBIC_16Bitsto16Bits(F16toI16,vxc_half4, vxc_short4,short4)\n\ +RESIZE_CUBIC_16Bitsto16Bits(I16toF16,vxc_short4,vxc_half4, half4)\n\ +\n\ +\n\ +#define RESIZE_CUBIC_Quant8toQuant8(name,data_type) \\\n\ +__kernel void resize_cubic_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + __read_only image2d_t scale_w, \\\n\ + __read_only image2d_t scale_h, \\\n\ + __read_only image2d_t index_w, \\\n\ + __read_only image2d_t index_h \\\n\ + ) \\\n\ +{ \\\n\ + RESIZE_CUBIC_PART0; \\\n\ + data_type src0,src1,src2,src3; \\\n\ + for (i = 0; i < 4; i++) \\\n\ + { \\\n\ + coord_in.x = read_imagei(index_w, coord_index.xw).x; \\\n\ + cubic_coeffs_x = vload4(i, scale_x); \\\n\ + coord_index.x = coord_index.x + 1; \\\n\ + \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src1_f, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src2_f, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src3_f, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + src0_f = src0_f * input_scale + input_tail; \\\n\ + src1_f = src1_f * input_scale + input_tail; \\\n\ + src2_f = src2_f * input_scale + input_tail; \\\n\ + src3_f = src3_f * input_scale + input_tail; \\\n\ + \\\n\ + dst = src0_f * cubic_coeffs_y.x \\\n\ + + src1_f * cubic_coeffs_y.y \\\n\ + + src2_f * cubic_coeffs_y.z \\\n\ + + src3_f * cubic_coeffs_y.w; \\\n\ + sum[i] = dot(dst, cubic_coeffs_x); \\\n\ + sum[i] = sum[i] * output_scale + output_tail; \\\n\ + } \\\n\ + float4 sum_f = (float4)(sum[0],sum[1],sum[2],sum[3]); \\\n\ + 
int4 tmpout; \\\n\ + _viv_asm(CONV,tmpout,sum_f); \\\n\ + data_type out; \\\n\ + VXC_DP2x8(out, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +RESIZE_CUBIC_Quant8toQuant8(U8toU8,vxc_uchar4)\n\ +RESIZE_CUBIC_Quant8toQuant8(I8toI8,vxc_char4 )\n\ +\n\ +#define RESIZE_CUBIC_F16toQuant8(name,dst_type) \\\n\ +__kernel void resize_cubic_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + __read_only image2d_t scale_w, \\\n\ + __read_only image2d_t scale_h, \\\n\ + __read_only image2d_t index_w, \\\n\ + __read_only image2d_t index_h \\\n\ + ) \\\n\ +{ \\\n\ + RESIZE_CUBIC_PART0; \\\n\ + vxc_half4 src0_h,src1_h,src2_h,src3_h; \\\n\ + vxc_short4 src0,src1,src2,src3; \\\n\ + for (i = 0; i < 4; i++) \\\n\ + { \\\n\ + coord_in.x = read_imagei(index_w, coord_index.xw).x; \\\n\ + cubic_coeffs_x = vload4(i, scale_x); \\\n\ + coord_index.x = coord_index.x + 1; \\\n\ + \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + _viv_asm(COPY, src0_h, src0, 8); \\\n\ + _viv_asm(COPY, src1_h, src1, 8); \\\n\ + _viv_asm(COPY, src2_h, src2, 8); \\\n\ + _viv_asm(COPY, src3_h, src3, 8); \\\n\ + \\\n\ + VXC_DP4x4(src0_f, src0_h, src0_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src1_f, src1_h, src1_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src2_f, src2_h, src2_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src3_f, src3_h, src3_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + \\\n\ + dst = src0_f * cubic_coeffs_y.x \\\n\ + + src1_f * cubic_coeffs_y.y \\\n\ + + src2_f * cubic_coeffs_y.z \\\n\ + + src3_f * cubic_coeffs_y.w; \\\n\ + sum[i] = dot(dst, cubic_coeffs_x); \\\n\ + sum[i] = sum[i] * output_scale + output_tail; \\\n\ + } \\\n\ + float4 sum_f = (float4)(sum[0],sum[1],sum[2],sum[3]); \\\n\ + int4 tmpout; \\\n\ + _viv_asm(CONV,tmpout,sum_f); \\\n\ + dst_type out; \\\n\ + VXC_DP2x8(out, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +RESIZE_CUBIC_F16toQuant8(F16toU8,vxc_uchar4)\n\ +RESIZE_CUBIC_F16toQuant8(F16toI8,vxc_char4)\n\ +\n\ +#define RESIZE_CUBIC_Quant8toF16(name,src_type) \\\n\ +__kernel void resize_cubic_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + __read_only image2d_t scale_w, \\\n\ + __read_only image2d_t scale_h, \\\n\ + __read_only image2d_t index_w, \\\n\ + __read_only image2d_t index_h \\\n\ + ) \\\n\ +{ \\\n\ + RESIZE_CUBIC_PART0; \\\n\ + src_type src0,src1,src2,src3; \\\n\ + for (i = 0; i < 4; i++) \\\n\ + { \\\n\ + coord_in.x = read_imagei(index_w, coord_index.xw).x; \\\n\ + cubic_coeffs_x = vload4(i, scale_x); \\\n\ 
+ coord_index.x = coord_index.x + 1; \\\n\ + \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src1_f, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src2_f, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src3_f, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + \\\n\ + src0_f = src0_f * input_scale + input_tail; \\\n\ + src1_f = src1_f * input_scale + input_tail; \\\n\ + src2_f = src2_f * input_scale + input_tail; \\\n\ + src3_f = src3_f * input_scale + input_tail; \\\n\ + \\\n\ + dst = src0_f * cubic_coeffs_y.x \\\n\ + + src1_f * cubic_coeffs_y.y \\\n\ + + src2_f * cubic_coeffs_y.z \\\n\ + + src3_f * cubic_coeffs_y.w; \\\n\ + sum[i] = dot(dst, cubic_coeffs_x); \\\n\ + } \\\n\ + float4 sum_f = (float4)(sum[0],sum[1],sum[2],sum[3]); \\\n\ + half4 tmpout; \\\n\ + _viv_asm(CONV,tmpout,sum_f); \\\n\ + vxc_half4 out_h; \\\n\ + vxc_short4 out; \\\n\ + VXC_DP2x8(out_h, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \\\n\ + _viv_asm(COPY, out, out_h, 8); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +RESIZE_CUBIC_Quant8toF16(U8toF16,vxc_uchar4)\n\ +RESIZE_CUBIC_Quant8toF16(I8toF16,vxc_char4)"; /* end of resize_cubic_vx*/ + static const char resize_nearest_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8;\n\ @@ -49593,6 +53322,379 @@ SCATTER_ND_UPDATE_COPY(F16, vxc_short8, 2, short)\n\ SCATTER_ND_UPDATE_COPY(BF16, vxc_short8, 2, short)\n\ "; /* end of scatter_nd_update_qint_vx*/ +static const char scatter_nd_update_reduction_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int update_width;\n\ +_viv_uniform int output_width;\n\ +\n\ +_viv_uniform int4 coord_stride;\n\ +_viv_uniform int4 coord_stride1;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int update_zp;\n\ +_viv_uniform float update_scale;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndU8SubZpToFp32_4x4;\n\ +\n\ +inline void AtomicAdd_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = prevVal.floatVal + operand;\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +inline void AtomicMul_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + 
union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = prevVal.floatVal * operand;\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +inline void AtomicMax_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = fmax(prevVal.floatVal, operand);\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +inline void AtomicMin_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = fmin(prevVal.floatVal, operand);\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +#define SCATTER_REDUCTION_PREPROCESS(name0, ptr0, type0, len0, size0, ptr2) \\\n\ +__kernel void scatter_nd_update_reduction_preprocess_##name0( \\\n\ + __read_only image2d_t input_ref, \\\n\ + image2d_t temp_buf_float, \\\n\ + int length, int res) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + Image img1 = create_image_from_image2d(input_ref, size0); \\\n\ + Image img2 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + __global float* tmp_ref_ptr = (__global float*)img2.ptr; \\\n\ + type0 src; \\\n\ + float4 tmpDst0, tmpDst1; \\\n\ + short zp = input_zp; \\\n\ + if(length > 0) \\\n\ + { \\\n\ + __global ptr0* input_ptr = (__global ptr0*)img1.ptr; \\\n\ + ptr0 tmpData = input_ptr[gidx]; \\\n\ + int loc2 = gidx * 8; \\\n\ + _viv_asm(COPY, src, tmpData, len0); \\\n\ + VXC_DP4x4(tmpDst0, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpDst1, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvert2ndU8SubZpToFp32_4x4); \\\n\ + tmpDst0 *= input_scale; \\\n\ + tmpDst1 *= input_scale; \\\n\ + vstore4(tmpDst0, 0, tmp_ref_ptr + loc2); \\\n\ + vstore4(tmpDst1, 1, tmp_ref_ptr + loc2); \\\n\ + } \\\n\ + __global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \\\n\ + for(int i = gidx; i < res; i += get_global_size(0)) \\\n\ + { \\\n\ + ptr2 tmpData1 = input_ptr1[length + i]; \\\n\ + _viv_asm(COPY, src, tmpData1, 4); \\\n\ + VXC_DP4x4(tmpDst0, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + tmp_ref_ptr[length + i] = tmpDst0.x; \\\n\ + } \\\n\ +}\n\ +SCATTER_REDUCTION_PREPROCESS(U8, vxc_uchar8, vxc_uchar8, 8, 1, uchar)\n\ +SCATTER_REDUCTION_PREPROCESS(I8, vxc_char8, vxc_char8, 8, 1, char)\n\ +SCATTER_REDUCTION_PREPROCESS(I16, vxc_short8, vxc_short8, 16, 2, short)\n\ +SCATTER_REDUCTION_PREPROCESS(F16, vxc_short8, vxc_half8, 16, 2, short)\n\ +\n\ +#define SCATTER_ND_REDUCTION_PROCESS_F16(name0, func) \\\n\ +__kernel void scatter_nd_update_reduction_##name0##_F16( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + image2d_t temp_buf_float, \\\n\ + image2d_t link_buffer0, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, 
int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(update, 2); \\\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global short* update_ptr = (__global short*)img2.ptr; \\\n\ + __global float* output_ptr = (__global float*)img3.ptr; \\\n\ + half src; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + short tmpData = update_ptr[gidy * update_width + gidx]; \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + int loc = idx * output_width + gidx; \\\n\ + _viv_asm(COPY, src, tmpData, 4); \\\n\ + float data; \\\n\ + _viv_asm(CONV, data, src); \\\n\ + func(output_ptr + loc, data); \\\n\ +}\n\ +SCATTER_ND_REDUCTION_PROCESS_F16(Add, AtomicAdd_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_F16(Mul, AtomicMul_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_F16(Max, AtomicMax_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_F16(Min, AtomicMin_float)\n\ +\n\ +#define SCATTER_ND_REDUCTION_PROCESS_BF16(name0, func) \\\n\ +__kernel void scatter_nd_update_reduction_##name0##_BF16( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + image2d_t temp_buf_float, \\\n\ + image2d_t link_buffer0, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(update, 2); \\\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global short* update_ptr = (__global short*)img2.ptr; \\\n\ + __global float* output_ptr = (__global float*)img3.ptr; \\\n\ + half src; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + short tmpData = update_ptr[gidy * update_width + gidx]; \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + vxc_short8 src0, src1; \\\n\ + float data; \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + int loc = idx * output_width + gidx; \\\n\ + _viv_asm(COPY, src0, tmpData, 4); \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data, src1, 4); \\\n\ + func(output_ptr + loc, data); \\\n\ +}\n\ +SCATTER_ND_REDUCTION_PROCESS_BF16(Add, AtomicAdd_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_BF16(Mul, AtomicMul_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_BF16(Max, AtomicMax_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_BF16(Min, AtomicMin_float)\n\ +\n\ +#define SCATTER_ND_UPDATE_PROCESS_QINT(name0, src0_type, data_type, ptr_type, element_size, func) \\\n\ +__kernel void scatter_nd_update_reduction_##name0##_##src0_type( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + image2d_t temp_buf_float, \\\n\ + image2d_t link_buffer0, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(update, element_size); \\\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \\\n\ + __global float* output_ptr = (__global float*)img3.ptr; \\\n\ + data_type src; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + ptr_type tmpData = update_ptr[gidy * update_width + gidx]; \\\n\ + short zp = update_zp; \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + int loc = idx * output_width + gidx; \\\n\ + _viv_asm(COPY, src, tmpData, 4); \\\n\ + vxc_float4 data; \\\n\ + VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + data.x *= update_scale; \\\n\ + func(output_ptr + loc, data.x); \\\n\ +}\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Add, U8, vxc_uchar8, uchar, 1, AtomicAdd_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, U8, vxc_uchar8, uchar, 1, AtomicMul_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Max, U8, vxc_uchar8, uchar, 1, AtomicMax_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Min, U8, vxc_uchar8, uchar, 1, AtomicMin_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Add, I8, vxc_char8, char, 1, AtomicAdd_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, I8, vxc_char8, char, 1, AtomicMul_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Max, I8, vxc_char8, char, 1, AtomicMax_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Min, I8, vxc_char8, char, 1, AtomicMin_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Add, I16, vxc_short8, short, 2, AtomicAdd_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, I16, vxc_short8, short, 2, AtomicMul_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Max, I16, vxc_short8, short, 2, AtomicMax_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Min, I16, vxc_short8, short, 2, AtomicMin_float)\n\ +"; /* end of scatter_nd_update_reduction_vx*/ + +static const char scatter_nd_update_reduction_conv_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ +\n\ +#define SCATTER_ND_UPDATE_CONV(src0_type, ptr_type, element_size, ptr_type1, conv_func) \\\n\ +__kernel void scatter_nd_update_reduction_conv_##src0_type( \\\n\ + __read_only image2d_t temp_buf_float, \\\n\ + __read_only image2d_t link_buf, \\\n\ + image2d_t output, \\\n\ + int length, int res) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + Image img1 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + Image img2 = create_image_from_image2d(output, element_size); \\\n\ + __global float* input_ptr = (__global float*)img1.ptr; \\\n\ + if(length > 0) \\\n\ + { \\\n\ + __global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \\\n\ + float4 src0 = vload4(0, input_ptr + gidx * 8); \\\n\ + float4 src1 = vload4(1, input_ptr + gidx * 8); \\\n\ + int4 data0 = convert_int4_rte(src0 * output_scale + output_zp); \\\n\ + int4 data1 = convert_int4_rte(src1 * output_scale + output_zp); \\\n\ + ptr_type dst; \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + output_ptr[gidx] = dst; \\\n\ + } \\\n\ + __global ptr_type1* output_ptr1 = (__global ptr_type1*)img2.ptr; \\\n\ + for(int i = gidx; i < res; i += get_global_size(0)) \\\n\ + { \\\n\ + float src = input_ptr[length + i]; \\\n\ + int data = convert_int_rte(src * output_scale + output_zp); \\\n\ + output_ptr1[length + i] = conv_func(data); \\\n\ + } \\\n\ +}\n\ +SCATTER_ND_UPDATE_CONV(U8, vxc_uchar8, 1, uchar, convert_uchar)\n\ +SCATTER_ND_UPDATE_CONV(I8, vxc_char8, 1, char, convert_char)\n\ 
+SCATTER_ND_UPDATE_CONV(I16, vxc_short8, 2, short, convert_short)\n\ +\n\ +__kernel void scatter_nd_update_reduction_conv_F16(\n\ + __read_only image2d_t temp_buf_float,\n\ + __read_only image2d_t link_buf,\n\ + image2d_t output,\n\ + int length, int res)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + Image img1 = create_image_from_image2d(temp_buf_float, 4);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + __global float* input_ptr = (__global float*)img1.ptr;\n\ + if(length > 0)\n\ + {\n\ + __global vxc_short8* output_ptr = (__global vxc_short8*)img2.ptr;\n\ + float4 src0 = vload4(0, input_ptr + gidx * 8);\n\ + float4 src1 = vload4(1, input_ptr + gidx * 8);\n\ + half4 data0, data1;\n\ + _viv_asm(CONV, data0, src0);\n\ + _viv_asm(CONV, data1, src1);\n\ + vxc_half8 tmp;\n\ + vxc_short8 dst;\n\ + VXC_DP2x8(tmp, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractHalf8_2x8);\n\ + _viv_asm(COPY, dst, tmp, 16);\n\ + output_ptr[gidx] = dst;\n\ + }\n\ + __global short* output_ptr1 = (__global short*)img2.ptr;\n\ + for(int i = gidx; i < res; i += get_global_size(0))\n\ + {\n\ + float src = input_ptr[length + i];\n\ + half data;\n\ + _viv_asm(CONV, data, src);\n\ + short dst;\n\ + _viv_asm(COPY, dst, data, 4);\n\ + output_ptr1[length + i] = dst;\n\ + }\n\ +}\n\ +\n\ +__kernel void scatter_nd_update_reduction_conv_BF16(\n\ + __read_only image2d_t temp_buf_float,\n\ + __read_only image2d_t link_buf,\n\ + image2d_t output,\n\ + int length, int res)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + Image img1 = create_image_from_image2d(temp_buf_float, 4);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + __global float* input_ptr = (__global float*)img1.ptr;\n\ + if(length > 0)\n\ + {\n\ + __global vxc_short8* output_ptr = (__global vxc_short8*)img2.ptr;\n\ + float4 src0 = vload4(0, input_ptr + gidx * 8);\n\ + float4 src1 = vload4(1, input_ptr + gidx * 8);\n\ + vxc_short8 dst0, dst1, dst;\n\ + _viv_asm(COPY, dst0, src0, 16);\n\ + _viv_asm(COPY, dst1, src1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + output_ptr[gidx] = dst;\n\ + }\n\ + __global short* output_ptr1 = (__global short*)img2.ptr;\n\ + for(int i = gidx; i < res; i += get_global_size(0))\n\ + {\n\ + float src = input_ptr[length + i];\n\ + vxc_short8 data;\n\ + _viv_asm(COPY, data, src, 4);\n\ + output_ptr1[length + i] = data.x;\n\ + }\n\ +}\n\ +"; /* end of scatter_nd_update_reduction_conv_vx*/ + static const char scatter_nd_update_special_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\ @@ -52516,10 +56618,10 @@ do\\\n\ #define VXC_VertMax3_Integer(dst, src0, src1, src2, info)\\\n\ do\\\n\ {\\\n\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ - int mod1 = VXC_MODIFIER_CLAMP(startBin, endBin, sourceBin, 0);\\\n\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ + constant int mod1 = VXC_MODIFIER_CLAMP(startBin, endBin, sourceBin, 0);\\\n\ typeof (dst) tmp;\\\n\ tmp = max(src0, src1);\\\n\ tmp = max(src2, tmp);\\\n\ @@ -52544,10 +56646,10 @@ do\\\n\ #define VXC_HorzMax3_Integer(dst, src0, info)\\\n\ do\\\n\ {\\\n\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ - int endBin = (info & 
VXC_END_BIN_BITMASK) >> 8;\\\n\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ - int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ + constant int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\\\n\ VXC_OP4(filter, dst, src0, src0, src0, mod1);\\\n\ } while (0)\n\ @@ -52555,12 +56657,12 @@ do\\\n\ #define VXC_HorzMax3_Half(dst, src0, info)\\\n\ do\\\n\ {\\\n\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ - int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ - int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\\\n\ - int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\\\n\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ + constant int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ + constant int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\\\n\ + constant int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\\\n\ vxc_short8 val0, minVal, maxVal;\\\n\ _viv_asm(COPY, val0, src0, 16);\\\n\ VXC_OP4(filter, maxVal, val0, val0, val0, mod1);\\\n\ @@ -52572,24 +56674,24 @@ do\\\n\ #define VXC_HorzMin3_Integer(dst, src0, info)\\\n\ do\\\n\ {\\\n\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ - int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ - int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\\\n\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ + constant int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ + constant int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\\\n\ VXC_OP4(filter, dst, src0, src0, src0, mod1);\\\n\ } while (0)\n\ \n\ #define VXC_HorzMin3_Half(dst, src0, info)\\\n\ do\\\n\ {\\\n\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ - int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ - int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\\\n\ - int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\\\n\ - int mod3 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Median, clamp);\\\n\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ + constant int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ + constant int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\\\n\ + constant int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\\\n\ + constant int mod3 = 
VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Median, clamp);\\\n\ vxc_short8 val0, minVal, maxVal, midVal;\\\n\ _viv_asm(COPY, val0, src0, 16);\\\n\ VXC_OP4(filter, maxVal, val0, val0, val0, mod1);\\\n\ @@ -54928,6 +59030,192 @@ __kernel void clip_U8toF32_2D(\n\ }\n\ "; /* end of clip_U8_cl*/ +static const char crop_and_resize_bilinear_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +\n\ +_viv_uniform float width_scale;\n\ +_viv_uniform float height_scale;\n\ +_viv_uniform int image_width;\n\ +_viv_uniform int image_height;\n\ +\n\ +#define CROP_AND_RESIZE_BILINEAR(name, read_type, dst_type, conv_type, write_type) \\\n\ +__kernel void crop_and_resize_bilinear_##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout, \\\n\ + float inOutScale, \\\n\ + float inOutTile, \\\n\ + float extrapolation_value \\\n\ +) \\\n\ +{ \\\n\ + int bb = get_global_id(2); \\\n\ + int y = get_global_id(1); \\\n\ + int x = get_global_id(0); \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int2 coord_box_ind = (int2)(bb, 0); \\\n\ + int b = read_imagei(box_ind, coord_box_ind).x; \\\n\ + float4 xy; \\\n\ + float in_x, in_y; \\\n\ + int d = 0; \\\n\ + \\\n\ + Image img_boxes = create_image_from_image2d(boxes, 2); \\\n\ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \\\n\ + xy = vload_half4(bb, boxes_ptr); \\\n\ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \\\n\ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \\\n\ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \\\n\ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \\\n\ + in_y = xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale; \\\n\ + in_x = xy.y * convert_float(image_width - 1) + convert_float(x) * _width_scale; \\\n\ + float y_lerp = in_y - floor(in_y); \\\n\ + float x_lerp = in_x - floor(in_x); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + int4 coord = (int4)(floor(in_x), floor(in_y), d + b * ori_depth, 0); \\\n\ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \\\n\ + { \\\n\ + src0 = (float4)(extrapolation_value,0,0,0); \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + src0 = convert_float4(read_type(input, coord)); \\\n\ + } \\\n\ + coord.x = coord.x + 1; \\\n\ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \\\n\ + { \\\n\ + src1 = (float4)(extrapolation_value,0,0,0); \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + src1 = convert_float4(read_type(input, coord)); \\\n\ + } \\\n\ + coord.y = coord.y + 1; \\\n\ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \\\n\ + { \\\n\ + src3 = (float4)(extrapolation_value,0,0,0); \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + src3 = convert_float4(read_type(input, coord)); \\\n\ + } \\\n\ + coord.x = coord.x - 1; \\\n\ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \\\n\ + { \\\n\ + src2 = (float4)(extrapolation_value,0,0,0); \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + src2 = convert_float4(read_type(input, coord)); \\\n\ + } \\\n\ + float4 top = src0 + (src1 - src0) * x_lerp; \\\n\ + float4 bottom = src2 + (src3 - src2) * x_lerp; \\\n\ + float4 
value = top + (bottom - top) * y_lerp; \\\n\ + value = value * inOutScale + inOutTile; \\\n\ + dst_type dst = conv_type(value); \\\n\ + coord_out.z = d + coord_out.z * ori_depth; \\\n\ + write_type(output, coord_out, dst); \\\n\ + } \\\n\ +}\n\ +\n\ +CROP_AND_RESIZE_BILINEAR(U32toU32,read_imageui, \\\n\ +uint4, convert_uint4, write_imageui)\n\ +CROP_AND_RESIZE_BILINEAR(U32toF32,read_imageui, \\\n\ +float4,convert_float4,write_imagef)\n\ +CROP_AND_RESIZE_BILINEAR(F32toF32,read_imagef, \\\n\ +float4, convert_float4,write_imagef)\n\ +CROP_AND_RESIZE_BILINEAR(F32toU32,read_imagef, \\\n\ +uint4, convert_uint4, write_imageui)\n\ +CROP_AND_RESIZE_BILINEAR(F32toI32,read_imagef, \\\n\ +int4, convert_int4, write_imagei)\n\ +CROP_AND_RESIZE_BILINEAR(I32toI32,read_imagei, \\\n\ +int4, convert_int4, write_imagei)\n\ +CROP_AND_RESIZE_BILINEAR(I32toF32,read_imagei, \\\n\ +float4,convert_float4,write_imagef)"; /* end of crop_and_resize_bilinear_cl*/ + +static const char crop_and_resize_nearest_neighbor_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +\n\ +_viv_uniform float width_scale;\n\ +_viv_uniform float height_scale;\n\ +_viv_uniform int image_width;\n\ +_viv_uniform int image_height;\n\ +\n\ +#define CROP_AND_RESIZE_NEAREST_NEIGHTBOR(name,src_type, read_type, dst_type, conv_type, write_type) \\\n\ +__kernel void crop_and_resize_nearest_neighbor_##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout, \\\n\ + float inOutScale, \\\n\ + float inOutTile, \\\n\ + float extrapolation_value \\\n\ +) \\\n\ +{ \\\n\ + int bb = get_global_id(2); \\\n\ + int y = get_global_id(1); \\\n\ + int x = get_global_id(0); \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int2 coord_box_ind = (int2)(bb, 0); \\\n\ + int b = read_imagei(box_ind, coord_box_ind).x; \\\n\ + float4 xy; \\\n\ + int in_x, in_y, d = 0; \\\n\ + \\\n\ + Image img_boxes = create_image_from_image2d(boxes, 2); \\\n\ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \\\n\ + xy = vload_half4(bb, boxes_ptr); \\\n\ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \\\n\ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \\\n\ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \\\n\ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \\\n\ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) \\\n\ + + convert_float(y) * _height_scale)); \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) \\\n\ + + convert_float(x) * _width_scale)); \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + int4 coord = (int4)(in_x, in_y, d + b * ori_depth, 0); \\\n\ + float4 src_f; \\\n\ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \\\n\ + { \\\n\ + src_f = (float4)(extrapolation_value, 0, 0, 0); \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + src_type src = read_type(input, coord); \\\n\ + src_f = convert_float4(src); \\\n\ + } \\\n\ + src_f = src_f * inOutScale + inOutTile; \\\n\ + dst_type dst = conv_type(src_f); \\\n\ + coord_out.z = d + coord_out.z * ori_depth; \\\n\ + write_type(output, coord_out, dst); \\\n\ + } \\\n\ +}\n\ +\n\ +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(U32toU32,uint4, \\\n\ +read_imageui, uint4, convert_uint4, write_imageui)\n\ 
+CROP_AND_RESIZE_NEAREST_NEIGHTBOR(U32toF32,uint4, \\\n\ +read_imageui, float4,convert_float4,write_imagef)\n\ +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(F32toF32,float4, \\\n\ +read_imagef, float4,convert_float4,write_imagef)\n\ +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(F32toU32,float4, \\\n\ +read_imagef, uint4, convert_uint4, write_imageui)\n\ +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(F32toI32,float4, \\\n\ +read_imagef, int4, convert_int4, write_imagei)\n\ +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(I32toI32,int4, \\\n\ +read_imagei, int4, convert_int4, write_imagei)\n\ +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(I32toF32,int4, \\\n\ +read_imagei, float4,convert_float4,write_imagef)"; /* end of crop_and_resize_nearest_neighbor_cl*/ + static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ @@ -56334,6 +60622,11 @@ float eltwise_unary_inverse_sigmoid(float x, float alpha, float beta)\n\ return log(x1 / x2);\n\ }\n\ \n\ +float eltwise_unary_tan(float x, float alpha, float beta)\n\ +{\n\ + return native_tan(x);\n\ +}\n\ +\n\ \n\ #define ELTWISE_UNARY_F32_2D(func_name) \\\n\ __kernel void func_name##_F32toF32_2D \\\n\ @@ -56376,6 +60669,7 @@ ELTWISE_UNARY_F32_2D(atan)\n\ ELTWISE_UNARY_F32_2D(atanh)\n\ ELTWISE_UNARY_F32_2D(acosh)\n\ ELTWISE_UNARY_F32_2D(inverse_sigmoid)\n\ +ELTWISE_UNARY_F32_2D(tan)\n\ \n\ #define ELTWISE_UNARY_U8_2D(func_name) \\\n\ __kernel void func_name##_U8toU8_2D \\\n\ @@ -56419,6 +60713,7 @@ ELTWISE_UNARY_U8_2D(atan)\n\ ELTWISE_UNARY_U8_2D(atanh)\n\ ELTWISE_UNARY_U8_2D(acosh)\n\ ELTWISE_UNARY_U8_2D(inverse_sigmoid)\n\ +ELTWISE_UNARY_U8_2D(tan)\n\ \n\ #define ELTWISE_UNARY_U8toF32_2D(func_name) \\\n\ __kernel void func_name##_U8toF32_2D \\\n\ @@ -56461,6 +60756,7 @@ ELTWISE_UNARY_U8toF32_2D(atan)\n\ ELTWISE_UNARY_U8toF32_2D(atanh)\n\ ELTWISE_UNARY_U8toF32_2D(acosh)\n\ ELTWISE_UNARY_U8toF32_2D(inverse_sigmoid)\n\ +ELTWISE_UNARY_U8toF32_2D(tan)\n\ \n\ __kernel void neg_I32toI32_2D\n\ (\n\ @@ -56660,6 +60956,11 @@ float eltwise_unary_inverse_sigmoid(float x, float alpha, float beta)\n\ return log(x1 / x2);\n\ }\n\ \n\ +float eltwise_unary_tan(float x, float alpha, float beta)\n\ +{\n\ + return native_tan(x);\n\ +}\n\ +\n\ #define ELTWISE_UNARY_F32(func_name) \\\n\ __kernel void func_name##_F32toF32 \\\n\ ( \\\n\ @@ -56701,6 +61002,7 @@ ELTWISE_UNARY_F32(atan)\n\ ELTWISE_UNARY_F32(atanh)\n\ ELTWISE_UNARY_F32(acosh)\n\ ELTWISE_UNARY_F32(inverse_sigmoid)\n\ +ELTWISE_UNARY_F32(tan)\n\ \n\ #define ELTWISE_UNARY_U8(func_name) \\\n\ __kernel void func_name##_U8toU8 \\\n\ @@ -56744,6 +61046,7 @@ ELTWISE_UNARY_U8(atan)\n\ ELTWISE_UNARY_U8(atanh)\n\ ELTWISE_UNARY_U8(acosh)\n\ ELTWISE_UNARY_U8(inverse_sigmoid)\n\ +ELTWISE_UNARY_U8(tan)\n\ \n\ #define ELTWISE_UNARY_U8toF32(func_name) \\\n\ __kernel void func_name##_U8toF32 \\\n\ @@ -56786,6 +61089,7 @@ ELTWISE_UNARY_U8toF32(atan)\n\ ELTWISE_UNARY_U8toF32(atanh)\n\ ELTWISE_UNARY_U8toF32(acosh)\n\ ELTWISE_UNARY_U8toF32(inverse_sigmoid)\n\ +ELTWISE_UNARY_U8toF32(tan)\n\ \n\ __kernel void neg_I32toI32\n\ (\n\ @@ -59180,7 +63484,8 @@ __kernel void grucell_activation_z_h_U8_F32toU8_##act_name( \\\n\ __read_only image2d_t hstate_h_conv, \\\n\ __write_only image2d_t output, \\\n\ __write_only image2d_t hstate_out, \\\n\ - float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp, \\\n\ + float output_scale1, float output_zp1) \\\n\ { \\\n\ int2 coord_in = (int2)(get_global_id(0), 
get_global_id(1)); \\\n\ float4 src0, src1, src2, src3; \\\n\ @@ -59197,10 +63502,12 @@ __kernel void grucell_activation_z_h_U8_F32toU8_##act_name( \\\n\ z.x = act_func(z.x); \\\n\ h = tanh_func(h.x); \\\n\ float4 dst = (1 - z ) * h + z * h_tm; \\\n\ - dst = dst * output_scale + output_zp; \\\n\ - uint4 result = convert_uint4_sat_rte(dst); \\\n\ + float4 out0 = dst * output_scale + output_zp; \\\n\ + float4 out1 = dst * output_scale1 + output_zp1; \\\n\ + uint4 result = convert_uint4_sat_rte(out0); \\\n\ + uint4 result1 = convert_uint4_sat_rte(out1); \\\n\ write_imageui(output, coord_in.xy, result); \\\n\ - write_imageui(hstate_out, coord_in.xy, result); \\\n\ + write_imageui(hstate_out, coord_in.xy, result1); \\\n\ }\n\ GRUCELL_ACTIVATION_U8_F32_U8(SIGMOID, sigmoid)\n\ //GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ @@ -59214,7 +63521,8 @@ __kernel void grucell_activation_z_h_F32_F32toF32_##act_name( \\\n\ __read_only image2d_t hstate_h_conv, \\\n\ __write_only image2d_t output, \\\n\ __write_only image2d_t hstate_out, \\\n\ - float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp, \\\n\ + float output_scale1, float output_zp1) \\\n\ { \\\n\ int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ float4 src0, src1, src2, src3; \\\n\ @@ -59246,7 +63554,8 @@ __kernel void grucell_activation_z_h_I32_F32toI32_##act_name( \\\n\ __read_only image2d_t hstate_h_conv, \\\n\ __write_only image2d_t output, \\\n\ __write_only image2d_t hstate_out, \\\n\ - float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp, \\\n\ + float output_scale1, float output_zp1) \\\n\ { \\\n\ int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ float4 src0, src1, src2, src3; \\\n\ @@ -59263,13 +63572,16 @@ __kernel void grucell_activation_z_h_I32_F32toI32_##act_name( \\\n\ z.x = act_func(z.x); \\\n\ h = tanh_func(h.x); \\\n\ float4 dst = (1 - z ) * h + z * h_tm; \\\n\ - dst = dst * output_scale + output_zp; \\\n\ - int4 result = convert_int4_sat_rte(dst); \\\n\ + float4 out0 = dst * output_scale + output_zp; \\\n\ + float4 out1 = dst * output_scale1 + output_zp1; \\\n\ + int4 result = convert_int4_sat_rte(out0); \\\n\ + int4 result1 = convert_int4_sat_rte(out1); \\\n\ write_imagei(output, coord_in.xy, result); \\\n\ - write_imagei(hstate_out, coord_in.xy, result); \\\n\ + write_imagei(hstate_out, coord_in.xy, result1); \\\n\ }\n\ GRUCELL_ACTIVATION_I32_F32_I32(SIGMOID, sigmoid)\n\ -//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)"; /* end of grucell_activation_z_h_cl*/ +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of grucell_activation_z_h_cl*/ static const char grucell_h_times_activation_r_cl[] = "#define logE (1.44269502f)\n\ #define twoLogE (logE * 2.0f)\n\ @@ -59382,6 +63694,12 @@ float tanh_func(float x)\n\ return 2 * x - 1;\n\ }\n\ \n\ +float relu_func(float x)\n\ +{\n\ + x = x > 0 ? 
x : 0;\n\ + return x;\n\ +}\n\ +\n\ \n\ #define GRUCELL_ACTIVATION_U8_F32_U8(act_name, act_func) \\\n\ __kernel void grucell_reset_after_activation_U8_F32toU8_##act_name( \\\n\ @@ -59423,6 +63741,7 @@ __kernel void grucell_reset_after_activation_U8_F32toU8_##act_name( \\\n\ }\n\ GRUCELL_ACTIVATION_U8_F32_U8(SIGMOID, sigmoid)\n\ //GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ +GRUCELL_ACTIVATION_U8_F32_U8(RELU, relu_func)\n\ \n\ #define GRUCELL_ACTIVATION_F32_F32_F32(act_name, act_func) \\\n\ __kernel void grucell_reset_after_activation_F32_F32toF32_##act_name( \\\n\ @@ -59462,6 +63781,7 @@ __kernel void grucell_reset_after_activation_F32_F32toF32_##act_name( \\\n\ \n\ GRUCELL_ACTIVATION_F32_F32_F32(SIGMOID, sigmoid)\n\ //GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ +GRUCELL_ACTIVATION_F32_F32_F32(RELU, relu_func)\n\ \n\ #define GRUCELL_ACTIVATION_I32_F32_I32(act_name, act_func) \\\n\ __kernel void grucell_reset_after_activation_I32_F32toI32_##act_name( \\\n\ @@ -59502,7 +63822,8 @@ __kernel void grucell_reset_after_activation_I32_F32toI32_##act_name( \\\n\ write_imagei(hstate_out, coord_in.xy, result); \\\n\ }\n\ GRUCELL_ACTIVATION_I32_F32_I32(SIGMOID, sigmoid)\n\ -//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)"; /* end of grucell_reset_after_activation_cl*/ +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ +GRUCELL_ACTIVATION_I32_F32_I32(RELU, relu_func)"; /* end of grucell_reset_after_activation_cl*/ static const char hswish_cl[] = "#define HSWISH_F32_F32_PROCESS() \\\n\ float4 src, tmp, dst; \\\n\ @@ -61968,6 +66289,349 @@ __kernel void log_softmax_axis2_BF16toBF16\n\ #undef rlogE\n\ "; /* end of log_softmax_axis2_cl*/ +static const char log_softmax_exceed_axis0_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +\n\ +\n\ +#define rlogE (0.693147182f)\n\ +float LOG(float x)\n\ +{\n\ + x = log2(x);\n\ + return x * rlogE;\n\ +}\n\ +\n\ +__kernel void log_softmax_exceed_axis0_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int z = get_global_id(1);\n\ + int4 coord_in = (int4)(0, 0, z, 0);\n\ + float4 maxValue;\n\ + float4 src, dst = {0.0};\n\ +\n\ + maxValue = read_imagef(input, coord_in);\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + maxValue = maxValue > src ? 
maxValue : src;\n\ + }\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ + write_imagef(output, coord_in, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_exceed_axis0_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int z = get_global_id(1);\n\ + int4 coord_in = (int4)(0, 0, z, 0);\n\ + float4 maxValue;\n\ + float4 src;\n\ + uint4 dst = {0};\n\ +\n\ + maxValue = convert_float4(read_imageui(input, coord_in));\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ + maxValue = maxValue > src ? maxValue : src;\n\ + }\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ +\n\ + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut);\n\ +\n\ + write_imageui(output, coord_in, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +\n\ +__kernel void log_softmax_exceed_axis0_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int z = get_global_id(1);\n\ + int4 coord_in = (int4)(0, 0, z, 0);\n\ + float4 maxValue, src, dst = {0.0};\n\ + uint4 data, val, out;\n\ +\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, maxValue, data, 16);\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + maxValue = maxValue > src ? 
maxValue : src;\n\ + }\n\ + }\n\ +\n\ + float sum = 0.f;\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ + }\n\ +\n\ + float logSum = LOG(sum);\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ + _viv_asm(COPY, val, dst, 16);\n\ + out = val >> 16;\n\ + write_imageui(output, coord_in, out);\n\ + }\n\ + }\n\ +}\n\ +#undef rlogE\n\ +"; /* end of log_softmax_exceed_axis0_cl*/ + +static const char log_softmax_exceed_axis1_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int height;\n\ +_viv_uniform int depth;\n\ +\n\ +#define rlogE (0.693147182f)\n\ +\n\ +float LOG(float x)\n\ +{\n\ + x = log2(x);\n\ + return x * rlogE;\n\ +}\n\ +\n\ +__kernel void log_softmax_exceed_axis1_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int4 coord_in = (int4)(x, 0, 0, 0);\n\ + float4 maxValue;\n\ + float4 src, dst = {0.0};\n\ +\n\ + maxValue = read_imagef(input, coord_in);\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + maxValue = maxValue > src ? maxValue : src;\n\ + }\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ +\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ + write_imagef(output, coord_in, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_exceed_axis1_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int4 coord_in = (int4)(x, 0, 0, 0);\n\ + float4 maxValue;\n\ + float4 src;\n\ + uint4 dst = {0};\n\ +\n\ + maxValue = convert_float4(read_imageui(input, coord_in));\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ +\n\ + maxValue = maxValue > src ? 
maxValue : src;\n\ + }\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ +\n\ + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut);\n\ +\n\ + write_imageui(output, coord_in, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_exceed_axis1_BF16oBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int4 coord_in = (int4)(x, 0, 0, 0);\n\ + float4 maxValue, src, dst = {0.0};\n\ + uint4 data, val, out;\n\ +\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, maxValue, data, 16);\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + maxValue = maxValue > src ? maxValue : src;\n\ + }\n\ + }\n\ +\n\ + float sum = 0.f;\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ + }\n\ +\n\ + float logSum = LOG(sum);\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ +\n\ + _viv_asm(COPY, val, dst, 16);\n\ + out = val >> 16;\n\ +\n\ + write_imageui(output, coord_in, out);\n\ + }\n\ + }\n\ +}\n\ +\n\ +#undef rlogE\n\ +"; /* end of log_softmax_exceed_axis1_cl*/ + static const char logical_not_cl[] = "__kernel void logical_not_I8toI8(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output)\n\ @@ -65593,7 +70257,135 @@ __kernel void gemm_4x_transa_F32F32toF32_2D(\n\ \n\ }\n\ \n\ +__kernel __attribute__((reqd_work_group_size(1, 64, 1)))\n\ + void gemm_4x_transa_local_F32F32toF32_2D(\n\ + __read_only image2d_t inputA,\n\ + __read_only image2d_t inputB,\n\ + __write_only image2d_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ + )\n\ +{\n\ + int offset0 = get_global_id(0);\n\ + int lid = get_local_id(1);\n\ \n\ + int stride = 0;\n\ +\n\ + int z = 0;\n\ + int offset1 = M << 2;\n\ + int step = K >> 8;\n\ + int lid2 = lid * 4 * step;\n\ +\n\ + Image in0_tensor = create_image_from_image2d(inputA, 4);\n\ + __global float* in0_ptr0 = (__global float*)in0_tensor.ptr + offset0 + lid2 * M;\n\ + __global float* in0_ptr1 = in0_ptr0 + M;\n\ + __global float* in0_ptr2 = in0_ptr1 + M;\n\ + __global float* in0_ptr3 = in0_ptr2 
+ M;\n\ +\n\ + Image in1_tensor = create_image_from_image2d(inputB, 4);\n\ + __global float* in1_ptr = (__global float*)in1_tensor.ptr + lid2;\n\ +\n\ + Image o_tensor = create_image_from_image2d(output, 4);\n\ + __global float* output_ptr = (__global float*)o_tensor.ptr + offset0;\n\ +\n\ + __local float4 sum_vec4_0[64];\n\ + __local float4 sum_vec4_1[64];\n\ + __local float4 sum_vec4_2[64];\n\ + __local float4 sum_vec4_3[64];\n\ +\n\ + float4 sum0 = (float4)(0.0, 0.0, 0.0, 0.0);\n\ + float4 sum1 = (float4)(0.0, 0.0, 0.0, 0.0);\n\ + float4 sum2 = (float4)(0.0, 0.0, 0.0, 0.0);\n\ + float4 sum3 = (float4)(0.0, 0.0, 0.0, 0.0);\n\ +\n\ + float4 tempA0, tempA1, tempA2, tempA3;\n\ + float4 tempA4, tempA5, tempA6, tempA7;\n\ + float4 tempB0;\n\ +\n\ + for(z = 0; z < step; z++)\n\ + {\n\ + tempB0 = vload4(z, in1_ptr);\n\ + tempA0 = vload4(0, in0_ptr0);\n\ + tempA1 = vload4(0, in0_ptr1);\n\ + tempA2 = vload4(0, in0_ptr2);\n\ + tempA3 = vload4(0, in0_ptr3);\n\ + tempA4 = vload4(1, in0_ptr0);\n\ + tempA5 = vload4(1, in0_ptr1);\n\ + tempA6 = vload4(1, in0_ptr2);\n\ + tempA7 = vload4(1, in0_ptr3);\n\ +\n\ + sum0 = sum0 + tempA0 * tempB0.x;\n\ + sum0 = sum0 + tempA1 * tempB0.y;\n\ + sum0 = sum0 + tempA2 * tempB0.z;\n\ + sum0 = sum0 + tempA3 * tempB0.w;\n\ + sum1 = sum1 + tempA4 * tempB0.x;\n\ + sum1 = sum1 + tempA5 * tempB0.y;\n\ + sum1 = sum1 + tempA6 * tempB0.z;\n\ + sum1 = sum1 + tempA7 * tempB0.w;\n\ +\n\ + tempA0 = vload4(2, in0_ptr0);\n\ + tempA1 = vload4(2, in0_ptr1);\n\ + tempA2 = vload4(2, in0_ptr2);\n\ + tempA3 = vload4(2, in0_ptr3);\n\ + tempA4 = vload4(3, in0_ptr0);\n\ + tempA5 = vload4(3, in0_ptr1);\n\ + tempA6 = vload4(3, in0_ptr2);\n\ + tempA7 = vload4(3, in0_ptr3);\n\ +\n\ + in0_ptr0 = in0_ptr0 + offset1;\n\ + in0_ptr1 = in0_ptr1 + offset1;\n\ + in0_ptr2 = in0_ptr2 + offset1;\n\ + in0_ptr3 = in0_ptr3 + offset1;\n\ +\n\ + sum2 = sum2 + tempA0 * tempB0.x;\n\ + sum2 = sum2 + tempA1 * tempB0.y;\n\ + sum2 = sum2 + tempA2 * tempB0.z;\n\ + sum2 = sum2 + tempA3 * tempB0.w;\n\ + sum3 = sum3 + tempA4 * tempB0.x;\n\ + sum3 = sum3 + tempA5 * tempB0.y;\n\ + sum3 = sum3 + tempA6 * tempB0.z;\n\ + sum3 = sum3 + tempA7 * tempB0.w;\n\ + }\n\ + sum_vec4_0[lid] = sum0;\n\ + sum_vec4_1[lid] = sum1;\n\ + sum_vec4_2[lid] = sum2;\n\ + sum_vec4_3[lid] = sum3;\n\ +\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + for (stride = 32; stride > 0; stride >>= 1)\n\ + {\n\ + if (lid < stride)\n\ + {\n\ + sum_vec4_0[lid] += sum_vec4_0[lid + stride];\n\ + sum_vec4_1[lid] += sum_vec4_1[lid + stride];\n\ + sum_vec4_2[lid] += sum_vec4_2[lid + stride];\n\ + sum_vec4_3[lid] += sum_vec4_3[lid + stride];\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + }\n\ +\n\ + if (lid == 0)\n\ + {\n\ + sum0 = sum_vec4_0[0];\n\ + sum1 = sum_vec4_1[0];\n\ + sum2 = sum_vec4_2[0];\n\ + sum3 = sum_vec4_3[0];\n\ + vstore4(sum0, 0, output_ptr);\n\ + vstore4(sum1, 1, output_ptr);\n\ + vstore4(sum2, 2, output_ptr);\n\ + vstore4(sum3, 3, output_ptr);\n\ + }\n\ +}\n\ \n\ "; /* end of matrixmul_4x_cl*/ @@ -71548,6 +76340,203 @@ __kernel void resize_bilinear_U8toU8(\n\ }\n\ "; /* end of resize_bilinear_cl*/ +static const char resize_cubic_cl[] = "__kernel void resize_cubic_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float half_pixel_value\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float cubic_coeffs_y[4] = {0,0,0,0};\n\ + float cubic_coeffs_x[4] = {0,0,0,0};\n\ + float in_x = (convert_float(coord_out.x) + 
half_pixel_value) * scale_x - half_pixel_value;\n\ + float left_x_f = floor(in_x);\n\ + float4 delta_x = (float4)(0, in_x - left_x_f,0,0);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float4 delta_y = (float4)(0, in_y - top_y_f,0,0);\n\ + int x_idx = convert_int(left_x_f - 1);\n\ + int y_idx = convert_int(top_y_f - 1);\n\ + int4 coord_in = (int4)(x_idx, y_idx, coord_out.z, 0);\n\ + float data00, data01, data02, data03, data10, data11, data12, data13,\n\ + data20, data21, data22, data23, data30, data31, data32, data33;\n\ +\n\ + delta_x.x = 1 + delta_x.y;\n\ + delta_x.z = 1 - delta_x.y;\n\ + delta_x.w = 2 - delta_x.y;\n\ + cubic_coeffs_x[0] = -0.5 * ((((delta_x.x - 5) * delta_x.x + 8) * delta_x.x) - 4);\n\ + cubic_coeffs_x[1] = (1.5 * delta_x.y - 2.5) * delta_x.y * delta_x.y + 1;\n\ + cubic_coeffs_x[2] = (1.5 * delta_x.z - 2.5) * delta_x.z * delta_x.z + 1;\n\ + cubic_coeffs_x[3] = -0.5 * ((((delta_x.w - 5) * delta_x.w + 8) * delta_x.w) - 4);\n\ + delta_y.x = 1 + delta_y.y;\n\ + delta_y.z = 1 - delta_y.y;\n\ + delta_y.w = 2 - delta_y.y;\n\ + cubic_coeffs_y[0] = -0.5 * ((((delta_y.x - 5) * delta_y.x + 8) * delta_y.x) - 4);\n\ + cubic_coeffs_y[1] = (1.5 * delta_y.y - 2.5) * delta_y.y * delta_y.y + 1;\n\ + cubic_coeffs_y[2] = (1.5 * delta_y.z - 2.5) * delta_y.z * delta_y.z + 1;\n\ + cubic_coeffs_y[3] = -0.5 * ((((delta_y.w - 5) * delta_y.w + 8) * delta_y.w) - 4);\n\ + float4 dst = (float4)(0,0,0,0);\n\ +\n\ + data00 = read_imagef(input, coord_in).x;\n\ + coord_in.x++;\n\ + data10 = read_imagef(input, coord_in).x;\n\ + coord_in.x++;\n\ + data20 = read_imagef(input, coord_in).x;\n\ + coord_in.x++;\n\ + data30 = read_imagef(input, coord_in).x;\n\ +\n\ + coord_in.y++;\n\ + data31 = read_imagef(input, coord_in).x;\n\ + coord_in.x--;\n\ + data21 = read_imagef(input, coord_in).x;\n\ + coord_in.x--;\n\ + data11 = read_imagef(input, coord_in).x;\n\ + coord_in.x--;\n\ + data01 = read_imagef(input, coord_in).x;\n\ +\n\ + coord_in.y++;\n\ + data02 = read_imagef(input, coord_in).x;\n\ + coord_in.x++;\n\ + data12 = read_imagef(input, coord_in).x;\n\ + coord_in.x++;\n\ + data22 = read_imagef(input, coord_in).x;\n\ + coord_in.x++;\n\ + data32 = read_imagef(input, coord_in).x;\n\ +\n\ + coord_in.y++;\n\ + data33 = read_imagef(input, coord_in).x;\n\ + coord_in.x--;\n\ + data23 = read_imagef(input, coord_in).x;\n\ + coord_in.x--;\n\ + data13 = read_imagef(input, coord_in).x;\n\ + coord_in.x--;\n\ + data03 = read_imagef(input, coord_in).x;\n\ +\n\ + dst.x = data00 * cubic_coeffs_x[0] * cubic_coeffs_y[0]\n\ + + data01 * cubic_coeffs_x[0] * cubic_coeffs_y[1]\n\ + + data02 * cubic_coeffs_x[0] * cubic_coeffs_y[2]\n\ + + data03 * cubic_coeffs_x[0] * cubic_coeffs_y[3]\n\ + + data10 * cubic_coeffs_x[1] * cubic_coeffs_y[0]\n\ + + data11 * cubic_coeffs_x[1] * cubic_coeffs_y[1]\n\ + + data12 * cubic_coeffs_x[1] * cubic_coeffs_y[2]\n\ + + data13 * cubic_coeffs_x[1] * cubic_coeffs_y[3]\n\ + + data20 * cubic_coeffs_x[2] * cubic_coeffs_y[0]\n\ + + data21 * cubic_coeffs_x[2] * cubic_coeffs_y[1]\n\ + + data22 * cubic_coeffs_x[2] * cubic_coeffs_y[2]\n\ + + data23 * cubic_coeffs_x[2] * cubic_coeffs_y[3]\n\ + + data30 * cubic_coeffs_x[3] * cubic_coeffs_y[0]\n\ + + data31 * cubic_coeffs_x[3] * cubic_coeffs_y[1]\n\ + + data32 * cubic_coeffs_x[3] * cubic_coeffs_y[2]\n\ + + data33 * cubic_coeffs_x[3] * cubic_coeffs_y[3];\n\ +\n\ + write_imagef(output, coord_out, dst);\n\ +\n\ +}\n\ +\n\ +\n\ +__kernel void resize_cubic_U8toU8(\n\ + __read_only 
image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float half_pixel_value,\n\ + float in_scale,\n\ + float in_tail,\n\ + float out_scale,\n\ + float out_tail\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float cubic_coeffs_y[4] = {0,0,0,0};\n\ + float cubic_coeffs_x[4] = {0,0,0,0};\n\ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float left_x_f = floor(in_x);\n\ + float4 delta_x = (float4)(0, in_x - left_x_f,0,0);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float4 delta_y = (float4)(0, in_y - top_y_f,0,0);\n\ + int x_idx = convert_int(left_x_f - 1);\n\ + int y_idx = convert_int(top_y_f - 1);\n\ + int4 coord_in = (int4)(x_idx, y_idx, coord_out.z, 0);\n\ + float data00, data01, data02, data03, data10, data11, data12, data13,\n\ + data20, data21, data22, data23, data30, data31, data32, data33;\n\ +\n\ + delta_x.x = 1 + delta_x.y;\n\ + delta_x.z = 1 - delta_x.y;\n\ + delta_x.w = 2 - delta_x.y;\n\ + cubic_coeffs_x[0] = -0.5 * ((((delta_x.x - 5) * delta_x.x + 8) * delta_x.x) - 4);\n\ + cubic_coeffs_x[1] = (1.5 * delta_x.y - 2.5) * delta_x.y * delta_x.y + 1;\n\ + cubic_coeffs_x[2] = (1.5 * delta_x.z - 2.5) * delta_x.z * delta_x.z + 1;\n\ + cubic_coeffs_x[3] = -0.5 * ((((delta_x.w - 5) * delta_x.w + 8) * delta_x.w) - 4);\n\ + delta_y.x = 1 + delta_y.y;\n\ + delta_y.z = 1 - delta_y.y;\n\ + delta_y.w = 2 - delta_y.y;\n\ + cubic_coeffs_y[0] = -0.5 * ((((delta_y.x - 5) * delta_y.x + 8) * delta_y.x) - 4);\n\ + cubic_coeffs_y[1] = (1.5 * delta_y.y - 2.5) * delta_y.y * delta_y.y + 1;\n\ + cubic_coeffs_y[2] = (1.5 * delta_y.z - 2.5) * delta_y.z * delta_y.z + 1;\n\ + cubic_coeffs_y[3] = -0.5 * ((((delta_y.w - 5) * delta_y.w + 8) * delta_y.w) - 4);\n\ + float dst = 0;\n\ + uint4 out = (uint4)(0,0,0,0);\n\ +\n\ + data00 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x++;\n\ + data10 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x++;\n\ + data20 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x++;\n\ + data30 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ +\n\ + coord_in.y++;\n\ + data31 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x--;\n\ + data21 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x--;\n\ + data11 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x--;\n\ + data01 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ +\n\ + coord_in.y++;\n\ + data02 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x++;\n\ + data12 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x++;\n\ + data22 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x++;\n\ + data32 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ +\n\ + coord_in.y++;\n\ + data33 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x--;\n\ + data23 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x--;\n\ + data13 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x--;\n\ + data03 = 
convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ +\n\ + dst = data00 * cubic_coeffs_x[0] * cubic_coeffs_y[0]\n\ + + data01 * cubic_coeffs_x[0] * cubic_coeffs_y[1]\n\ + + data02 * cubic_coeffs_x[0] * cubic_coeffs_y[2]\n\ + + data03 * cubic_coeffs_x[0] * cubic_coeffs_y[3]\n\ + + data10 * cubic_coeffs_x[1] * cubic_coeffs_y[0]\n\ + + data11 * cubic_coeffs_x[1] * cubic_coeffs_y[1]\n\ + + data12 * cubic_coeffs_x[1] * cubic_coeffs_y[2]\n\ + + data13 * cubic_coeffs_x[1] * cubic_coeffs_y[3]\n\ + + data20 * cubic_coeffs_x[2] * cubic_coeffs_y[0]\n\ + + data21 * cubic_coeffs_x[2] * cubic_coeffs_y[1]\n\ + + data22 * cubic_coeffs_x[2] * cubic_coeffs_y[2]\n\ + + data23 * cubic_coeffs_x[2] * cubic_coeffs_y[3]\n\ + + data30 * cubic_coeffs_x[3] * cubic_coeffs_y[0]\n\ + + data31 * cubic_coeffs_x[3] * cubic_coeffs_y[1]\n\ + + data32 * cubic_coeffs_x[3] * cubic_coeffs_y[2]\n\ + + data33 * cubic_coeffs_x[3] * cubic_coeffs_y[3];\n\ + out.x = convert_uint(dst * out_scale + out_tail);\n\ +\n\ + write_imageui(output, coord_out, out);\n\ +}\n\ +"; /* end of resize_cubic_cl*/ + static const char resize_nearest_cl[] = "\n\ #define NEAREST_INDEX_PROCESS() \\\n\ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -73239,6 +78228,284 @@ SCATTER_ND_UPDATE(I32, int4, read_imagei, write_imagei)\n\ SCATTER_ND_UPDATE(F32, float4, read_imagef, write_imagef)\n\ "; /* end of scatter_nd_update_cl*/ +static const char scatter_nd_update_reduction_cl[] = "\n\ +inline void AtomicAdd_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = prevVal.floatVal + operand;\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +inline void AtomicMul_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = prevVal.floatVal * operand;\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +inline void AtomicMax_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = fmax(prevVal.floatVal, operand);\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +inline void AtomicMin_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = fmin(prevVal.floatVal, operand);\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +#define SCATTER_REDUCTION_PREPROCESS(name0, ptr0, type0, size0, ptr2) \\\n\ +__kernel 
void scatter_nd_update_reduction_preprocess_##name0( \\\n\ + __read_only image2d_t input_ref, \\\n\ + image2d_t temp_buf_float, \\\n\ + int length, int res, float input_scale, float zp_scale) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + Image img1 = create_image_from_image2d(input_ref, size0); \\\n\ + Image img2 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + __global float* tmp_ref_ptr = (__global float*)img2.ptr; \\\n\ + type0 src0, src1; \\\n\ + float4 tmpDst0, tmpDst1; \\\n\ + __global ptr2* input_ptr = (__global ptr2*)img1.ptr; \\\n\ + if(length > 0) \\\n\ + { \\\n\ + int loc2 = gidx * 8; \\\n\ + ptr0 tmpData0 = vload4(0, input_ptr + loc2); \\\n\ + ptr0 tmpData1 = vload4(1, input_ptr + loc2); \\\n\ + _viv_asm(COPY, src0, tmpData0, 16); \\\n\ + _viv_asm(COPY, src1, tmpData1, 16); \\\n\ + _viv_asm(CONV, tmpDst0, src0); \\\n\ + _viv_asm(CONV, tmpDst1, src1); \\\n\ + tmpDst0 = tmpDst0 * input_scale + zp_scale; \\\n\ + tmpDst1 = tmpDst1 * input_scale + zp_scale; \\\n\ + vstore4(tmpDst0, 0, tmp_ref_ptr + loc2); \\\n\ + vstore4(tmpDst1, 1, tmp_ref_ptr + loc2); \\\n\ + } \\\n\ + for(int i = gidx; i < res; i += get_global_size(0)) \\\n\ + { \\\n\ + ptr2 tmpData0 = input_ptr[length + i]; \\\n\ + _viv_asm(COPY, src0, tmpData0, 4); \\\n\ + _viv_asm(CONV, tmpDst0, src0); \\\n\ + tmpDst0.x = tmpDst0.x * input_scale + zp_scale; \\\n\ + tmp_ref_ptr[length + i] = tmpDst0.x; \\\n\ + } \\\n\ +}\n\ +SCATTER_REDUCTION_PREPROCESS(U8, uchar4, uchar4, 1, uchar)\n\ +SCATTER_REDUCTION_PREPROCESS(I8, char4, char4, 1, char)\n\ +SCATTER_REDUCTION_PREPROCESS(I16, short4, short4, 2, short)\n\ +SCATTER_REDUCTION_PREPROCESS(F16, short4, half4, 2, short)\n\ +SCATTER_REDUCTION_PREPROCESS(F32, float4, float4, 4, float)\n\ +\n\ +#define SCATTER_ND_REDUCTION_PROCESS_F16(name0, func) \\\n\ +__kernel void scatter_nd_update_reduction_##name0##_F16( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + image2d_t temp_buf_float, \\\n\ + image2d_t link_buffer0, \\\n\ + int val0, int val1, int val2, int val3, int val4, int val5, int val6, \\\n\ + int coord_dim, int update_width, int output_width, float update_scale, float zp_scale) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(update, 2); \\\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global short* update_ptr = (__global short*)img2.ptr; \\\n\ + __global float* output_ptr = (__global float*)img3.ptr; \\\n\ + half src; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + short tmpData = update_ptr[gidy * update_width + gidx]; \\\n\ + int idx = indice.x * val0 + indice.y * val1 + indice.z * val2 + indice.w * val3; \\\n\ + idx = idx + indice1.x * val4 + indice1.y * val5 + indice1.z * val6; \\\n\ + int loc = idx * output_width + gidx; \\\n\ + _viv_asm(COPY, src, tmpData, 4); \\\n\ + float data; \\\n\ + _viv_asm(CONV, data, src); \\\n\ + func(output_ptr + loc, data); \\\n\ +}\n\ +SCATTER_ND_REDUCTION_PROCESS_F16(Add, AtomicAdd_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_F16(Mul, AtomicMul_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_F16(Max, AtomicMax_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_F16(Min, AtomicMin_float)\n\ +\n\ +#define SCATTER_ND_UPDATE_PROCESS_QINT(name0, src0_type, ptr_type, element_size, func) \\\n\ +__kernel void scatter_nd_update_reduction_##name0##_##src0_type( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + image2d_t temp_buf_float, \\\n\ + image2d_t link_buffer0, \\\n\ + int val0, int val1, int val2, int val3, int val4, int val5, int val6, \\\n\ + int coord_dim, int update_width, int output_width, float update_scale, float zp_scale) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(update, element_size); \\\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \\\n\ + __global float* output_ptr = (__global float*)img3.ptr; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + ptr_type tmpData = update_ptr[gidy * update_width + gidx]; \\\n\ + int idx = indice.x * val0 + indice.y * val1 + indice.z * val2 + indice.w * val3; \\\n\ + idx = idx + indice1.x * val4 + indice1.y * val5 + indice1.z * val6; \\\n\ + int loc = idx * output_width + gidx; \\\n\ + float data; \\\n\ + _viv_asm(CONV, data, tmpData); \\\n\ + data = data * update_scale + zp_scale; \\\n\ + func(output_ptr + loc, data); \\\n\ +}\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Add, U8, uchar, 1, AtomicAdd_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, U8, uchar, 1, AtomicMul_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Max, U8, uchar, 1, AtomicMax_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Min, U8, uchar, 1, AtomicMin_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Add, I8, char, 1, AtomicAdd_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, I8, char, 1, AtomicMul_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Max, I8, char, 1, AtomicMax_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Min, I8, char, 1, AtomicMin_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Add, I16, short, 2, AtomicAdd_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, I16, short, 2, AtomicMul_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Max, I16, short, 2, AtomicMax_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Min, I16, short, 2, AtomicMin_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Add, F32, float, 4, AtomicAdd_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, F32, float, 4, AtomicMul_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Max, F32, float, 4, AtomicMax_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Min, F32, float, 4, AtomicMin_float)"; /* end of scatter_nd_update_reduction_cl*/ + +static const char scatter_nd_update_reduction_conv_cl[] = "__kernel void scatter_nd_update_reduction_conv_F16(\n\ 
+ __read_only image2d_t temp_buf_float,\n\ + __read_only image2d_t link_buf,\n\ + image2d_t output,\n\ + int length, int res, float output_scale, float output_zp)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + Image img1 = create_image_from_image2d(temp_buf_float, 4);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + __global float* input_ptr = (__global float*)img1.ptr;\n\ + __global short* output_ptr = (__global short*)img2.ptr;\n\ + if(length > 0)\n\ + {\n\ + int offset = gidx * 8;\n\ + float4 src0 = vload4(0, input_ptr + offset);\n\ + float4 src1 = vload4(1, input_ptr + offset);\n\ + half4 data0, data1;\n\ + _viv_asm(CONV, data0, src0);\n\ + _viv_asm(CONV, data1, src1);\n\ + short4 dst0, dst1;\n\ + _viv_asm(COPY, dst0, data0, 16);\n\ + _viv_asm(COPY, dst1, data1, 16);\n\ + vstore4(dst0, 0, output_ptr + offset);\n\ + vstore4(dst1, 1, output_ptr + offset);\n\ + }\n\ + for(int i = gidx; i < res; i += get_global_size(0))\n\ + {\n\ + float src = input_ptr[length + i];\n\ + half data;\n\ + _viv_asm(CONV, data, src);\n\ + short dst;\n\ + _viv_asm(COPY, dst, data, 4);\n\ + output_ptr[length + i] = dst;\n\ + }\n\ +}\n\ +\n\ +#define SCATTER_ND_UPDATE_CONV(src0_type, ptr_type, element_size, ptr_type1, conv_func) \\\n\ +__kernel void scatter_nd_update_reduction_conv_##src0_type( \\\n\ + __read_only image2d_t temp_buf_float, \\\n\ + __read_only image2d_t link_buf, \\\n\ + image2d_t output, \\\n\ + int length, int res, float output_scale, float output_zp) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + Image img1 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + Image img2 = create_image_from_image2d(output, element_size); \\\n\ + __global float* input_ptr = (__global float*)img1.ptr; \\\n\ + __global ptr_type1* output_ptr = (__global ptr_type1*)img2.ptr; \\\n\ + if(length > 0) \\\n\ + { \\\n\ + int offset = gidx * 8; \\\n\ + float4 src0 = vload4(0, input_ptr + offset); \\\n\ + float4 src1 = vload4(1, input_ptr + offset); \\\n\ + int4 data0 = convert_int4_rte(src0 * output_scale + output_zp); \\\n\ + int4 data1 = convert_int4_rte(src1 * output_scale + output_zp); \\\n\ + ptr_type dst0, dst1; \\\n\ + _viv_asm(CONV, dst0, data0); \\\n\ + _viv_asm(CONV, dst1, data1); \\\n\ + vstore4(dst0, 0, output_ptr + offset); \\\n\ + vstore4(dst1, 1, output_ptr + offset); \\\n\ + } \\\n\ + for(int i = gidx; i < res; i += get_global_size(0)) \\\n\ + { \\\n\ + float src = input_ptr[length + i]; \\\n\ + int data = convert_int_rte(src * output_scale + output_zp); \\\n\ + output_ptr[length + i] = conv_func(data); \\\n\ + } \\\n\ +}\n\ +SCATTER_ND_UPDATE_CONV(U8, uchar4, 1, uchar, convert_uchar)\n\ +SCATTER_ND_UPDATE_CONV(I8, char4, 1, char, convert_char)\n\ +SCATTER_ND_UPDATE_CONV(I16, short4, 2, short, convert_short)\n\ +SCATTER_ND_UPDATE_CONV(F32, float4, 4, float, convert_float)\n\ +"; /* end of scatter_nd_update_reduction_conv_cl*/ + static const char select_cl[] = "__kernel void select_I8_U8_U8toU8(\n\ __read_only image2d_array_t condition,\n\ __read_only image2d_array_t input0,\n\ @@ -73818,7 +79085,7 @@ __kernel void swish_I32toI32_2D(\n\ src = read_imagef(input, coord); \\\n\ tmp.x = sigmoid_(src.x * beta, logE); \\\n\ data.x = src.x * tmp.x; \\\n\ - uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ + uint4 dst = convert_uint4_rte(data * outputScale + outputZP); \\\n\ write_imageui(output, coord, dst);\n\ \n\ __kernel void swish_F32toU8(\n\ @@ -75205,6 +80472,8 @@ static const source_map_t evis_resource[] = {"clip_U8_vx", clip_U8_vx}, {"conv1d_ovxlib_vx", conv1d_ovxlib_vx}, 
{"conv1d_ovxlib_k1024_vx", conv1d_ovxlib_k1024_vx}, + {"crop_and_resize_bilinear_vx", crop_and_resize_bilinear_vx}, + {"crop_and_resize_nearest_neighbor_vx", crop_and_resize_nearest_neighbor_vx}, {"cumsum_vx", cumsum_vx}, {"cumsum_2d_vx", cumsum_2d_vx}, {"cumsum_bf16_vx", cumsum_bf16_vx}, @@ -75273,11 +80542,20 @@ static const source_map_t evis_resource[] = {"layer_normalization_1_vx", layer_normalization_1_vx}, {"layer_normalization_2_vx", layer_normalization_2_vx}, {"layer_normalization_3_vx", layer_normalization_3_vx}, + {"layer_normalization_axis01_0_vx", layer_normalization_axis01_0_vx}, + {"layer_normalization_axis01_1_vx", layer_normalization_axis01_1_vx}, + {"layer_normalization_axis01_2_vx", layer_normalization_axis01_2_vx}, + {"layer_normalization_axis01_3_vx", layer_normalization_axis01_3_vx}, + {"layer_normalization_axis01_sum_vx", layer_normalization_axis01_sum_vx}, {"log_softmax_axis0_vx", log_softmax_axis0_vx}, {"log_softmax_axis0_BF16_vx", log_softmax_axis0_BF16_vx}, {"log_softmax_axis1_vx", log_softmax_axis1_vx}, {"log_softmax_axis1_BF16_vx", log_softmax_axis1_BF16_vx}, {"log_softmax_axis2_vx", log_softmax_axis2_vx}, + {"log_softmax_exceed_axis0_vx", log_softmax_exceed_axis0_vx}, + {"log_softmax_exceed_axis0_BF16_vx", log_softmax_exceed_axis0_BF16_vx}, + {"log_softmax_exceed_axis1_vx", log_softmax_exceed_axis1_vx}, + {"log_softmax_exceed_axis1_BF16_vx", log_softmax_exceed_axis1_BF16_vx}, {"logical_not_vx", logical_not_vx}, {"logical_ops_vx", logical_ops_vx}, {"lstmunit_activation_BP_BF16_vx", lstmunit_activation_BP_BF16_vx}, @@ -75361,6 +80639,8 @@ static const source_map_t evis_resource[] = {"pre_process_gray_2_vx", pre_process_gray_2_vx}, {"pre_process_gray_copy_vx", pre_process_gray_copy_vx}, {"pre_process_nv12_copy_vx", pre_process_nv12_copy_vx}, + {"pre_process_nv12_rggb_copy_vx", pre_process_nv12_rggb_copy_vx}, + {"pre_process_nv12_rggb_scale_vx", pre_process_nv12_rggb_scale_vx}, {"pre_process_nv12_scale_vx", pre_process_nv12_scale_vx}, {"pre_process_rgb_vx", pre_process_rgb_vx}, {"pre_process_rgb888_planar_0_vx", pre_process_rgb888_planar_0_vx}, @@ -75420,6 +80700,7 @@ static const source_map_t evis_resource[] = {"resize_bilinear_F16_vx", resize_bilinear_F16_vx}, {"resize_bilinear_I16_vx", resize_bilinear_I16_vx}, {"resize_bilinear_I8_vx", resize_bilinear_I8_vx}, + {"resize_bilinear_U16_vx", resize_bilinear_U16_vx}, {"resize_bilinear_U8_vx", resize_bilinear_U8_vx}, {"resize_bilinear_U8_half_pixel_centers_1_vx", resize_bilinear_U8_half_pixel_centers_1_vx}, {"resize_bilinear_U8_half_pixel_centers_2_vx", resize_bilinear_U8_half_pixel_centers_2_vx}, @@ -75427,6 +80708,7 @@ static const source_map_t evis_resource[] = {"resize_bilinear_align_corners_vx", resize_bilinear_align_corners_vx}, {"resize_bilinear_nhwc_vx", resize_bilinear_nhwc_vx}, {"resize_bilinear_nhwc_bound_vx", resize_bilinear_nhwc_bound_vx}, + {"resize_cubic_vx", resize_cubic_vx}, {"resize_nearest_vx", resize_nearest_vx}, {"scatter_nd_vx", scatter_nd_vx}, {"scatter_nd_big_vx", scatter_nd_big_vx}, @@ -75435,6 +80717,8 @@ static const source_map_t evis_resource[] = {"scatter_nd_update_big_vx", scatter_nd_update_big_vx}, {"scatter_nd_update_fp_vx", scatter_nd_update_fp_vx}, {"scatter_nd_update_qint_vx", scatter_nd_update_qint_vx}, + {"scatter_nd_update_reduction_vx", scatter_nd_update_reduction_vx}, + {"scatter_nd_update_reduction_conv_vx", scatter_nd_update_reduction_conv_vx}, {"scatter_nd_update_special_vx", scatter_nd_update_special_vx}, {"select_vx", select_vx}, {"sequence_mask_vx", sequence_mask_vx}, @@ 
-75475,6 +80759,8 @@ static const source_map_t cl_resource[] = {"clip_F32_cl", clip_F32_cl}, {"clip_I32_cl", clip_I32_cl}, {"clip_U8_cl", clip_U8_cl}, + {"crop_and_resize_bilinear_cl", crop_and_resize_bilinear_cl}, + {"crop_and_resize_nearest_neighbor_cl", crop_and_resize_nearest_neighbor_cl}, {"cumsum_cl", cumsum_cl}, {"cumsum_2d_cl", cumsum_2d_cl}, {"depth2space_crd_cl", depth2space_crd_cl}, @@ -75511,6 +80797,8 @@ static const source_map_t cl_resource[] = {"log_softmax_axis0_cl", log_softmax_axis0_cl}, {"log_softmax_axis1_cl", log_softmax_axis1_cl}, {"log_softmax_axis2_cl", log_softmax_axis2_cl}, + {"log_softmax_exceed_axis0_cl", log_softmax_exceed_axis0_cl}, + {"log_softmax_exceed_axis1_cl", log_softmax_exceed_axis1_cl}, {"logical_not_cl", logical_not_cl}, {"logical_ops_cl", logical_ops_cl}, {"lppool_cl", lppool_cl}, @@ -75581,6 +80869,7 @@ static const source_map_t cl_resource[] = {"resize_3d_bilinear_cl", resize_3d_bilinear_cl}, {"resize_3d_nearest_cl", resize_3d_nearest_cl}, {"resize_bilinear_cl", resize_bilinear_cl}, + {"resize_cubic_cl", resize_cubic_cl}, {"resize_nearest_cl", resize_nearest_cl}, {"reversesequence_cl", reversesequence_cl}, {"roi_align_cl", roi_align_cl}, @@ -75589,6 +80878,8 @@ static const source_map_t cl_resource[] = {"scatter_elements_mul_cl", scatter_elements_mul_cl}, {"scatter_nd_cl", scatter_nd_cl}, {"scatter_nd_update_cl", scatter_nd_update_cl}, + {"scatter_nd_update_reduction_cl", scatter_nd_update_reduction_cl}, + {"scatter_nd_update_reduction_conv_cl", scatter_nd_update_reduction_conv_cl}, {"select_cl", select_cl}, {"sequence_mask_cl", sequence_mask_cl}, {"signal_frame_cl", signal_frame_cl}, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c b/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c index 6252e4d..b2188f1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c @@ -38,60 +38,6 @@ #define _INPUT_NUM (4) #define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) - -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_FAILURE; - - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, - "axis_aligned_bbox_transform", - inputs, _INPUT_NUM, - outputs, _OUTPUT_NUM, NULL ); - - if ( self->n ) - { - status = VSI_SUCCESS; - } - - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - /*TODO: Check tensor shapes. 
*/ - VSI_UNREFERENCED(self); - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(outputs); - return TRUE; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - VSI_UNREFERENCED(self); - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) - { - outputs[0]->attr.size[0] = inputs[1]->attr.size[0]; - outputs[0]->attr.size[1] = inputs[1]->attr.size[1]; - outputs[0]->attr.dim_num = 2; - } - return TRUE; -} /* op_setup() */ #ifdef __cplusplus extern "C" { @@ -101,10 +47,10 @@ DEF_OP_REG ( /* op_name */ AXIS_ALIGNED_BBOX_TRANSFORM, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, + /* compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, /* optimize */ NULL, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c index 7afa231..47ccd79 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c @@ -370,6 +370,20 @@ static vsi_bool op_setup curr->inputs[LSTMUNIT_INPUT_LAYERNORM_C] = inputs[BI_LSTM_FW_INPUT_LAYERNORM_C]; curr->inputs[LSTMUNIT_INPUT_LAYERNORM_O] = inputs[BI_LSTM_FW_INPUT_LAYERNORM_O]; + if (self->input.num > LSTM_INPUT_BIAS_R2I ) + { + curr->inputs[LSTMUNIT_INPUT_BIAS_R2I] = inputs[BI_LSTM_FW_INPUT_BIAS_R2I]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2F] = inputs[BI_LSTM_FW_INPUT_BIAS_R2F]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2C] = inputs[BI_LSTM_FW_INPUT_BIAS_R2C]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2O] = inputs[BI_LSTM_FW_INPUT_BIAS_R2O]; + } + else + { + curr->inputs[LSTMUNIT_INPUT_BIAS_R2I] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2F] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2C] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2O] = NULL; + } if (has_aux_input) { curr->inputs[LSTM_INPUT_AUX_INPUT] = aux_reshape_output_tensors[i]; @@ -475,6 +489,21 @@ static vsi_bool op_setup curr->inputs[LSTMUNIT_INPUT_LAYERNORM_C] = inputs[BI_LSTM_BW_INPUT_LAYERNORM_C]; curr->inputs[LSTMUNIT_INPUT_LAYERNORM_O] = inputs[BI_LSTM_BW_INPUT_LAYERNORM_O]; + if (self->input.num > LSTM_INPUT_BIAS_R2I) + { + curr->inputs[LSTMUNIT_INPUT_BIAS_R2I] = inputs[BI_LSTM_BW_INPUT_BIAS_R2I]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2F] = inputs[BI_LSTM_BW_INPUT_BIAS_R2F]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2C] = inputs[BI_LSTM_BW_INPUT_BIAS_R2C]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2O] = inputs[BI_LSTM_BW_INPUT_BIAS_R2O]; + } + else + { + curr->inputs[LSTMUNIT_INPUT_BIAS_R2I] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2F] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2C] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2O] = NULL; + } + if (has_aux_input) { curr->inputs[LSTM_INPUT_AUX_INPUT] = aux_reshape_output_tensors[time_step - 1 - i]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c index 34afc98..ecb1640 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c @@ -267,8 +267,11 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { +#define _TENSOR_LEN 64 vsi_status status; vsi_nn_concat_lcl_data * iter; + char tensor_name[_TENSOR_LEN]; + uint32_t sub_id = 0; status = VSI_SUCCESS; self->n = NULL; @@ -282,6 +285,15 @@ static vsi_status op_compute { iter->cp_node = 
vxTensorCopyNode(self->graph->g, iter->src_tensor, iter->dst_tensor ); + /* Set copy output tensor name */ + memset(tensor_name, 0, sizeof(tensor_name)); + snprintf(tensor_name, sizeof(tensor_name), "uid_%u_sub_id_%u_out_0", self->uid, sub_id); + if(vxSetReferenceName((vx_reference)iter->dst_tensor, tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u copy node output name fail", self->uid); + return VSI_FAILURE; + } + sub_id++; if( NULL == iter->cp_node ) { VSILOGE( "Create vxTensorCopyNode fail." ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c index 3a31d44..235ab87 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c @@ -39,6 +39,19 @@ #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" +static vsi_status reshape_activation_output + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * in_tensor, + vsi_nn_tensor_t * out_tensor + ) +{ + vsi_nn_rnn_create_reshape(self, in_tensor, out_tensor, + out_tensor->attr.size, out_tensor->attr.dim_num, TRUE); + + return VSI_SUCCESS; +} /* reshape_activation_output() */ + static vsi_nn_internal_tensor_t * reshape_tensor_to_act ( vsi_nn_node_t* self, @@ -350,16 +363,19 @@ static vsi_bool op_setup ) { uint32_t i; + vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t * input_conv_outputs[CONV2D_LSTM_CELL_GATE_NUM] = { NULL }; vsi_nn_internal_tensor_t * recurrent_conv_outputs[CONV2D_LSTM_CELL_GATE_NUM] = { NULL }; vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_tensor_t * reshape_cell_in = NULL; - vsi_nn_internal_tensor_t * reshape_out = NULL; - vsi_nn_internal_tensor_t * reshape_h_out = NULL; - vsi_nn_internal_tensor_t * reshape_c_out = NULL; + vsi_nn_internal_tensor_t * act_out = NULL; + vsi_nn_internal_tensor_t * act_h_out = NULL; + vsi_nn_internal_tensor_t * act_c_out = NULL; vsi_nn_conv2d_lstm_cell_param * p = &self->nn_param.conv2d_lstm_cell; vsi_bool ret = FALSE; + vsi_status status = VSI_FAILURE; + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); /* compute output tensor's shapes */ @@ -410,18 +426,27 @@ static vsi_bool op_setup curr->inputs[LSTMUNIT_ACT_INPUT_FC_I + i] = input_conv_outputs[i]->t; curr->inputs[LSTMUNIT_ACT_HSTATE_FC_I + i] = recurrent_conv_outputs[i]->t; } - reshape_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]); - CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_out, curr, "Create internal tensor failed", final); - reshape_h_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_H_STATE]); - CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_h_out, curr, "Create internal tensor failed", final); - reshape_c_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_C_STATE]); - CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_c_out, curr, "Create internal tensor failed", final); - curr->outputs[LSTMUNIT_ACT_OUTPUT] = reshape_out->t; - curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = reshape_c_out->t; - curr->outputs[LSTMUNIT_ACT_HSTATE_OUT] = reshape_h_out->t; + // create activation output/hstate_output/cstate_output + vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.dtype, TRUE); + act_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_H_STATE]->attr.dtype, TRUE); + act_h_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + vsi_nn_internal_init_tensor_attr(&attr, 
&outputs[CONV2D_LSTM_CELL_OUT_C_STATE]->attr.dtype, TRUE); + act_c_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + curr->outputs[LSTMUNIT_ACT_OUTPUT] = act_out->t; + curr->outputs[LSTMUNIT_ACT_HSTATE_OUT] = act_h_out->t; + curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = act_c_out->t; vsi_nn_internal_setup_node(self, curr); + // reshape activation output(2d) to conv2d_lstm_cell output(4d) + status = reshape_activation_output(self, act_out->t, outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]); + CHECK_STATUS_FAIL_GOTO(status, final); + status = reshape_activation_output(self, act_h_out->t, outputs[CONV2D_LSTM_CELL_OUT_H_STATE]); + CHECK_STATUS_FAIL_GOTO(status, final); + status = reshape_activation_output(self, act_c_out->t, outputs[CONV2D_LSTM_CELL_OUT_C_STATE]); + CHECK_STATUS_FAIL_GOTO(status, final); + ret = TRUE; final: return ret; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c index 85d35df..f51d471 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c @@ -37,294 +37,6 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status; - vx_nn_convolution_relu_pooling_params_ext2_t p; - status = VSI_FAILURE; - - if(vsi_nn_InitConvReluPoolParameter(self, &p, FALSE) != VSI_SUCCESS) - { - VSILOGE("SetConvReluParameter fail\n"); - return VSI_FAILURE; - } - - self->n = vxConvolutionReluPoolingLayer2( - self->graph->g, - inputs[0]->t, - inputs[1]->wb, - (vx_nn_convolution_relu_pooling_params_t *)&p, - sizeof(p), - outputs[0]->t - ); - - vsi_nn_DeinitConvReluPoolParameter( &p ); - - if( NULL != self->n ) - { - status = VSI_SUCCESS; - } - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_bool ret = FALSE; - BEGIN_IO_TYPE_DECL(CONV_RELU, 3, 1) - IO_TYPE(D_F16, D_F16, D_NONE, D_F16) - IO_TYPE(D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) - IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_BF16) - IO_TYPE(D_F32, D_F32, D_NONE, D_F32) - 
END_IO_TYPE_DECL(CONV_RELU) - if (!VALIDATE_OP_IO_TYPES(CONV_RELU, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - - /* Check fl and scale*/ - ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); - - return ret; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_bool ret = FALSE; - -#ifdef VX_CONVERT_POLICY_WRAP_ENABLE - if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) - { - self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; - } -#endif - - ret = vsi_nn_OpSetup( VSI_NN_OP_CONV2D, self, inputs, outputs ); - - return ret; -} /* op_setup() */ - -static vsi_status op_optimize - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_opt_direction_e direction - ) -{ - vsi_status status; - vx_nn_convolution_relu_pooling_params_ext2_t p; - vx_weights_biases_parameter_optimizations_t opt; - vx_weights_biases_parameter_optimizations_t * p_opt; - - status = VSI_SUCCESS; - - if(direction == VSI_NN_OPTIMIZE_BACKWARD) - { - return VSI_SUCCESS; - } - - VSILOGD("Optimize %s", vsi_nn_OpGetName(self->op)); - /* Prepare weight_bias */ - if(inputs[1]->wb == NULL) - { - if(vsi_nn_InitConvReluPoolParameter(self, &p, FALSE) != VSI_SUCCESS) - { - VSILOGE("SetConvReluParameter fail\n"); - return VSI_FAILURE; - } - - p_opt = NULL; - if( outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC - || inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - memset( &opt, 0, sizeof( opt ) ); - opt.inputZeroPoint = inputs[0]->attr.dtype.zero_point; - opt.zrl = -1; - opt.outputFormat = outputs[0]->attr.dtype.vx_type; - p_opt = &opt; - } - -#ifdef VSI_40BIT_VA_SUPPORT - { - vx_size size_input0[VSI_NN_MAX_DIM_NUM]; - vx_size size_output0[VSI_NN_MAX_DIM_NUM]; - size_t i = 0; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - size_input0[i] = (vx_size)inputs[0]->attr.size[i]; - size_output0[i] = (vx_size)outputs[0]->attr.size[i]; - } - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( - VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, - 4, - size_input0, - size_output0, - size_output0, - outputs[0]->attr.dtype.vx_type, - (vx_nn_convolution_relu_pooling_params_t *)&p, - sizeof(p), - p_opt, - inputs[1]->t, inputs[2]->t - ); - } -#else - { - uint32_t size_u32_input0[VSI_NN_MAX_DIM_NUM]; - uint32_t size_u32_output0[VSI_NN_MAX_DIM_NUM]; - size_t i = 0; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - size_u32_input0[i] = (uint32_t)inputs[0]->attr.size[i]; - size_u32_output0[i] = (uint32_t)outputs[0]->attr.size[i]; - } - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( - VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, - 4, - size_u32_input0, - size_u32_output0, - size_u32_output0, - outputs[0]->attr.dtype.vx_type, - (vx_nn_convolution_relu_pooling_params_t *)&p, - sizeof(p), - p_opt, - inputs[1]->t, inputs[2]->t - ); - } -#endif - vsi_nn_DeinitConvReluPoolParameter( &p ); - } - - if( NULL == inputs[1]->wb ) - { - VSILOGE( "Create weight bias fail." 
); - status = VSI_FAILURE; - } - - return status; -} /* op_optimize() */ - -vsi_status vsi_nn_InitConvReluPoolParameter - ( - vsi_nn_node_t * node, - vx_nn_convolution_relu_pooling_params_ext2_t * param_ext2, - vsi_bool has_pool - ) -{ - int32_t pad_const_val; - vx_scalar pad_const; - vx_nn_convolution_relu_pooling_params_t *param; - vx_nn_convolution_relu_pooling_params_ext_t *param_ext; - - pad_const_val = 0; - pad_const = NULL; - param = NULL; - - if( NULL == node || NULL == param_ext2 ) - { - VSILOGE("Set param fail\n"); - return VSI_FAILURE; - } - memset( param_ext2, 0, sizeof( vx_nn_convolution_relu_pooling_params_ext2_t ) ); - param_ext = ¶m_ext2->ext; - param = ¶m_ext->base; - - pad_const = vxCreateScalar( node->graph->ctx->c, VX_TYPE_INT32, &pad_const_val ); - if( NULL == pad_const ) - { - VSILOGE("Create scalar fail\n"); - return VSI_FAILURE; - } - - if( node->nn_param.conv2d.dilation[0] > 0 ) - { - param->dilation_x = node->nn_param.conv2d.dilation[0] - 1; - } - if( node->nn_param.conv2d.dilation[1] > 0 ) - { - param->dilation_y = node->nn_param.conv2d.dilation[1] - 1; - } - param->pad_x_left = node->nn_param.conv2d.pad[0]; - param->pad_x_right = node->nn_param.conv2d.pad[1]; - param->pad_y_top = node->nn_param.conv2d.pad[2]; - param->pad_y_bottom = node->nn_param.conv2d.pad[3]; - param->accumulator_bits = (vx_uint8)node->vx_param.accumulator_bits; - param->overflow_policy = node->vx_param.overflow_policy; - param->rounding_policy = node->vx_param.rounding_policy; - param->down_scale_size_rounding = node->vx_param.down_scale_size_rounding; - param->enable_relu = (vx_bool)node->vx_param.has_relu; - param->pad_mode = VX_PAD_CONSTANT; - param->pad_const = pad_const; - if( TRUE == has_pool ) - { - param->pool_type = node->nn_param.pool.type; - param->pool_size_x = node->nn_param.pool.ksize[0]; - param->pool_size_y = node->nn_param.pool.ksize[1]; - } - param_ext->stride_x = node->nn_param.conv2d.stride[0]; - param_ext->stride_y = node->nn_param.conv2d.stride[1]; - - param_ext2->depth_multiplier = node->nn_param.conv2d.multiplier; - - return VSI_SUCCESS; -} /* vsi_nn_InitConvReluPoolParameter() */ - -void vsi_nn_DeinitConvReluPoolParameter - ( - vx_nn_convolution_relu_pooling_params_ext2_t * param - ) -{ - if( NULL != param ) - { - if( NULL != param->ext.base.pad_const ) - { - vxReleaseScalar( ¶m->ext.base.pad_const ); - } - } -} /* vsi_nn_DeinitConvReluPoolParameter() */ - #ifdef __cplusplus extern "C" { #endif @@ -333,11 +45,11 @@ DEF_OP_REG ( /* op_name */ CONV_RELU, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, - /* optimize */ op_optimize, + /* compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, + /* optimize */ NULL, /* input_num */ 3, /* output_num */ 1 ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c index aef5a68..2ca96c1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c @@ -37,250 +37,6 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status; - vx_nn_convolution_relu_pooling_params_ext2_t p; - status = VSI_FAILURE; - - if(vsi_nn_InitConvReluPoolParameter(self, &p, TRUE) != VSI_SUCCESS) - { - VSILOGE("SetConvReluPoolParameter fail\n"); - 
return VSI_FAILURE; - } - - self->n = vxConvolutionReluPoolingLayer2( - self->graph->g, - inputs[0]->t, - inputs[1]->wb, - (vx_nn_convolution_relu_pooling_params_t *)&p, - sizeof(p), - outputs[0]->t - ); - - vsi_nn_DeinitConvReluPoolParameter( &p ); - - if( NULL != self->n ) - { - status = VSI_SUCCESS; - } - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_bool ret = FALSE; - BEGIN_IO_TYPE_DECL(CONV_RELU_POOL, 3, 1) - IO_TYPE(D_F16, D_F16, D_NONE, D_F16) - IO_TYPE(D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) - IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_BF16) - IO_TYPE(D_F32, D_F32, D_NONE, D_F32) - END_IO_TYPE_DECL(CONV_RELU_POOL) - if (!VALIDATE_OP_IO_TYPES(CONV_RELU_POOL, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - - /* Check fl and scale*/ - ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); - - return ret; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_bool ret; - -#ifdef VX_CONVERT_POLICY_WRAP_ENABLE - if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) - { - self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; - } -#endif - - ret = TRUE; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) - { - ret = vsi_nn_OpSetup( VSI_NN_OP_CONV2D, self, inputs, outputs ); - if(ret == FALSE) - { - VSILOGE("OpSetup [VSI_NN_OP_CONV2D] fail\n"); - return FALSE; - } - - ret = vsi_nn_OpSetup( VSI_NN_OP_POOL, self, outputs, outputs ); - if(ret == FALSE) - { - VSILOGE("OpSetup [VSI_NN_OP_POOL] fail\n"); - return FALSE; - } - } - - return ret; -} /* op_setup() */ - -static vsi_status op_optimize - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_opt_direction_e direction - ) -{ - vsi_status status; - vsi_bool ret; - vsi_nn_tensor_prv_t conv_out, *pconv_out; - vx_nn_convolution_relu_pooling_params_ext2_t p; - 
vx_weights_biases_parameter_optimizations_t opt; - vx_weights_biases_parameter_optimizations_t * p_opt; - ret = FALSE; - status = VSI_FAILURE; - - if(direction == VSI_NN_OPTIMIZE_BACKWARD) - { - return VSI_SUCCESS; - } - - VSILOGD("Optimize %s", vsi_nn_OpGetName(self->op)); - memset(&conv_out, 0, sizeof(vsi_nn_tensor_prv_t)); - pconv_out = &conv_out; - - ret = vsi_nn_OpSetup( VSI_NN_OP_CONV2D, self, inputs, (vsi_nn_tensor_t**)(&pconv_out) ); - if(ret == FALSE) - { - VSILOGE("OpSetup [VSI_NN_OP_CONV2D] fail\n"); - goto final; - } - - /* Prepare weight_bias */ - if(inputs[1]->wb == NULL) - { - if(vsi_nn_InitConvReluPoolParameter(self, &p, TRUE) != VSI_SUCCESS) - { - VSILOGE("SetConvReluPoolParameter fail\n"); - goto final; - } - - p_opt = NULL; - if( outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC - || inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - memset( &opt, 0, sizeof( opt ) ); - opt.inputZeroPoint = inputs[0]->attr.dtype.zero_point; - opt.zrl = -1; - opt.outputFormat = outputs[0]->attr.dtype.vx_type; - p_opt = &opt; - } - -#ifdef VSI_40BIT_VA_SUPPORT - { - vx_size size_input0[VSI_NN_MAX_DIM_NUM]; - vx_size size_pconv_out[VSI_NN_MAX_DIM_NUM]; - vx_size size_output0[VSI_NN_MAX_DIM_NUM]; - size_t i = 0; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - size_input0[i] = (vx_size)inputs[0]->attr.size[i]; - size_pconv_out[i] = (vx_size)pconv_out->pot.attr.size[i]; - size_output0[i] = (vx_size)outputs[0]->attr.size[i]; - } - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( - VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, - 4, - size_input0, - size_pconv_out, - size_output0, - outputs[0]->attr.dtype.vx_type, - (vx_nn_convolution_relu_pooling_params_t *)&p, - sizeof(p), - p_opt, - inputs[1]->t, inputs[2]->t - ); - } -#else - { - uint32_t size_u32_input0[VSI_NN_MAX_DIM_NUM]; - uint32_t size_u32_pconv_out[VSI_NN_MAX_DIM_NUM]; - uint32_t size_u32_output0[VSI_NN_MAX_DIM_NUM]; - size_t i = 0; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - size_u32_input0[i] = (uint32_t)inputs[0]->attr.size[i]; - size_u32_pconv_out[i] = (uint32_t)pconv_out->pot.attr.size[i]; - size_u32_output0[i] = (uint32_t)outputs[0]->attr.size[i]; - } - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( - VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, - 4, - size_u32_input0, - size_u32_pconv_out, - size_u32_output0, - outputs[0]->attr.dtype.vx_type, - (vx_nn_convolution_relu_pooling_params_t *)&p, - sizeof(p), - p_opt, - inputs[1]->t, inputs[2]->t - ); - } -#endif - vsi_nn_DeinitConvReluPoolParameter( &p ); - } - - if( NULL == inputs[1]->wb ) - { - VSILOGE( "Create weight bias fail." 
); - } - else - { - status = VSI_SUCCESS; - } - -final: - return status; -} /* op_optimize() */ #ifdef __cplusplus extern "C" { @@ -290,11 +46,11 @@ DEF_OP_REG ( /* op_name */ CONV_RELU_POOL, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, - /* optimize */ op_optimize, + /* compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, + /* optimize */ NULL, /* input_num */ 3, /* output_num */ 1 ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_crop_and_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_crop_and_resize.c new file mode 100644 index 0000000..d41e457 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_crop_and_resize.c @@ -0,0 +1,193 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _crop_and_resize_local_data_t { + int32_t placeholder; +} crop_and_resize_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + + float extrapolation_value = 0; + int32_t resize_method = 0; + + if (NULL == self) + { + return status; + } + + extrapolation_value = self->nn_param.crop_and_resize.extrapolation_value; + resize_method = self->nn_param.crop_and_resize.resize_method; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "resize_method", (int32_t)resize_method ); + vsi_nn_kernel_param_add_float32( param, "extrapolation_value", (float)extrapolation_value ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "crop_and_resize", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, param); + + if ( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(CROP_AND_RESIZE, 3, 1) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_I32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_I32, D_F32) + IO_TYPE(D_F16, D_F16, D_I32, D_F16) + IO_TYPE(D_F16, D_F16, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F16, D_I32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I32, D_I16|Q_SYM) + IO_TYPE(D_F32, D_F16, D_I32, D_F32) + IO_TYPE(D_F32, D_F16, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F32, D_F16, D_I32, D_I8|Q_SYM) + IO_TYPE(D_F32, D_F16, D_I32, D_I8|Q_ASYM) + IO_TYPE(D_F32, D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F32, D_F16, D_I32, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I32, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_I32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_I32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I32, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_I32, D_F32) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I32, D_F32) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I32, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I32, D_F32) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I32, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I32, D_F32) + + END_IO_TYPE_DECL(CROP_AND_RESIZE) + if (!VALIDATE_OP_IO_TYPES(CROP_AND_RESIZE, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_crop_and_resize_param * p = NULL; + + p = (vsi_nn_crop_and_resize_param* )&(self->nn_param.crop_and_resize); + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = p->crop_size[1]; + outputs[0]->attr.size[1] = p->crop_size[0]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[2]->attr.size[0]; + } + return TRUE; +} + +static vsi_status op_init + ( + vsi_nn_node_t* 
self + ) +{ + self->nn_param.crop_and_resize.resize_method = VSI_NN_INTERPOLATION_BILINEAR; + self->nn_param.crop_and_resize.extrapolation_value = 0; + + return VSI_SUCCESS; +} /* op_init() */ + + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CROP_AND_RESIZE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c index 6b7cc6f..28556ab 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c @@ -40,6 +40,7 @@ #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) +#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT) static vsi_status op_compute ( vsi_nn_node_t * self, @@ -150,11 +151,25 @@ static vsi_status op_deinit return VSI_SUCCESS; } /* op_deinit() */ - +#endif #ifdef __cplusplus extern "C" { #endif +#if (VX_DEPTH2SPACE_CRD_MODE_SUPPORT) +DEF_OP_REG + ( + /* op_name */ DEPTH2SPACE_INTERNAL, + /* init */ NULL, + /* compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#else DEF_OP_REG ( /* op_name */ DEPTH2SPACE_INTERNAL, @@ -167,6 +182,7 @@ DEF_OP_REG /* input_num */ 1, /* output_num */ 1 ); +#endif #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_detection_postprocess.c b/src/tim/vx/internal/src/ops/vsi_nn_op_detection_postprocess.c index 726c672..0cd247f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_detection_postprocess.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_detection_postprocess.c @@ -38,139 +38,6 @@ #define _INPUT_NUM (3) #define _OUTPUT_NUM (4) -#define _BOX_INPUT_NUM (2) -#define _BOX_OUTPUT_NUM (1) -#define _NMS_INPUT_NUM (2) -#define _NMS_OUTPUT_NUM (4) - -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_param_t * param0 = NULL; - vsi_nn_kernel_param_t * param1 = NULL; - vsi_nn_tensor_t* box_tensors[3] = { NULL }; - vsi_nn_tensor_t* nms_tensors[6] = { NULL }; - vsi_nn_tensor_t* bbox_tensor = NULL; - vsi_nn_tensor_attr_t attr; - vsi_nn_detection_postprocess_param * p = &(self->nn_param.detection_postprocess); - float inv_scale_y, inv_scale_x, inv_scale_h, inv_scale_w; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - - attr.size[0] = 4; - attr.size[1] = inputs[0]->attr.size[1]; - attr.size[2] = inputs[0]->attr.size[2]; - attr.dim_num = 3; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; - attr.is_const = FALSE; - attr.vtl = TRUE; - bbox_tensor = vsi_nn_CreateTensor( self->graph, &attr ); - - inv_scale_y = 1.0f / p->dy; - inv_scale_x = 1.0f / p->dx; - inv_scale_h = 1.0f / p->dh; - inv_scale_w = 1.0f / p->dw; - - if (bbox_tensor) - { - param0 = vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_float32( param0, "inv_scale_y", inv_scale_y); - vsi_nn_kernel_param_add_float32( param0, "inv_scale_x", inv_scale_x); - vsi_nn_kernel_param_add_float32( param0, "inv_scale_h", inv_scale_h); - vsi_nn_kernel_param_add_float32( param0, "inv_scale_w", inv_scale_w); - box_tensors[0] = inputs[1]; - box_tensors[1] = inputs[2]; - box_tensors[2] = bbox_tensor; - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "detect_post_box", - 
&box_tensors[0], _BOX_INPUT_NUM, - &box_tensors[2], _BOX_OUTPUT_NUM, param0 ); - - param1 =vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32( param1, "nms_type", p->nms_type); - vsi_nn_kernel_param_add_int32( param1, "max_num_detections", p->max_num_detections); - vsi_nn_kernel_param_add_int32( param1, "maximum_class_per_detection", p->maximum_class_per_detection); - vsi_nn_kernel_param_add_int32( param1, "maximum_detection_per_class", p->maximum_detection_per_class); - vsi_nn_kernel_param_add_float32( param1, "score_threshold", p->score_threshold); - vsi_nn_kernel_param_add_float32( param1, "iou_threshold", p->iou_threshold); - vsi_nn_kernel_param_add_int32( param1, "is_bg_in_label", p->is_bg_in_label); - nms_tensors[0] = inputs[0]; - nms_tensors[1] = bbox_tensor; - nms_tensors[2] = outputs[0]; - nms_tensors[3] = outputs[1]; - nms_tensors[4] = outputs[2]; - nms_tensors[5] = outputs[3]; - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "detect_post_nms", - &nms_tensors[0], _NMS_INPUT_NUM, - &nms_tensors[2], _NMS_OUTPUT_NUM, param1 ); - vsi_nn_ReleaseTensor( &bbox_tensor ); - vsi_nn_kernel_param_release( &param0 ); - vsi_nn_kernel_param_release( &param1 ); - } - if( self->n ) - { - status = VSI_SUCCESS; - } - - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - BEGIN_IO_TYPE_DECL(DETECTION_POSTPROCESS, 3, 1) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - END_IO_TYPE_DECL(DETECTION_POSTPROCESS) - if (!VALIDATE_OP_IO_TYPES(DETECTION_POSTPROCESS, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - return TRUE; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) - { - vsi_nn_detection_postprocess_param * p; - p = &(self->nn_param.detection_postprocess); - - outputs[0]->attr.dim_num = 2; - outputs[0]->attr.size[0] = p->max_num_detections; - outputs[0]->attr.size[1] = inputs[0]->attr.size[2]; - - outputs[1]->attr.dim_num = 3; - outputs[1]->attr.size[0] = 4; - outputs[1]->attr.size[1] = p->max_num_detections; - outputs[1]->attr.size[2] = inputs[0]->attr.size[2]; - - outputs[2]->attr.dim_num = 2; - outputs[2]->attr.size[0] = p->max_num_detections; - outputs[2]->attr.size[1] = inputs[0]->attr.size[2]; - - outputs[3]->attr.dim_num = 1; - outputs[3]->attr.size[0] = inputs[0]->attr.size[2]; - } - return TRUE; -} /* op_setup() */ #ifdef __cplusplus extern "C" { @@ -180,10 +47,10 @@ DEF_OP_REG ( /* op_name */ DETECTION_POSTPROCESS, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, + /* compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, /* optimize */ NULL, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c index a7bc5d1..44e051e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c @@ -79,8 +79,11 @@ static vsi_status _eltwise_op_compute if ( strcmp(kernel_name, "sub") == 0 || 
strcmp(kernel_name, "add") == 0 || strcmp(kernel_name, "mul") == 0 - || (strcmp(kernel_name, "maximum") == 0 && !is_executed_on_sh) - || (strcmp(kernel_name, "minimum") == 0 && !is_executed_on_sh) +#if VX_TENSOR_POW_API_SUPPORT + || strcmp(kernel_name, "pow") == 0 +#endif + || (strcmp(kernel_name, "maximum") == 0) + || (strcmp(kernel_name, "minimum") == 0) || (strcmp(kernel_name, "div") == 0 && !is_executed_on_sh)) { doShapeOptimized = FALSE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c index 280e5ee..708c748 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -275,6 +275,7 @@ DEF_ELEMENT_WISE_UNARY_OP( ATAN, atan ); DEF_ELEMENT_WISE_UNARY_OP( ATANH, atanh ); DEF_ELEMENT_WISE_UNARY_OP( ACOSH, acosh ); DEF_ELEMENT_WISE_UNARY_OP( INVERSE_SIGMOID, inverse_sigmoid ); +DEF_ELEMENT_WISE_UNARY_OP( TAN, tan ); #undef DEF_ELEMENT_UNARY_WISE_OP diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c index b91fec8..484f6a6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c @@ -37,311 +37,6 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" -static vsi_status _set_fc_relu_parameter - ( - vsi_nn_node_t * self, - vx_nn_convolution_relu_pooling_params_t * param - ); - -static vsi_status _set_fc_relu_parameter - ( - vsi_nn_node_t * self, - vx_nn_convolution_relu_pooling_params_t * param - ) -{ - vx_scalar pad_const; - int32_t pad_const_val; - - pad_const_val = 0; - memset( param, 0, sizeof(vx_nn_convolution_relu_pooling_params_t) ); - pad_const = vxCreateScalar(self->graph->ctx->c, VX_TYPE_INT32, &pad_const_val); - if( !pad_const ) - { - VSILOGE("Create scalar fail\n"); - return VSI_FAILURE; - } - - param->pad_x_left = 0; - param->pad_x_right = 0; - param->pad_y_top = 0; - param->pad_y_bottom = 0; - param->dilation_x = 0; - param->dilation_y = 0; - param->accumulator_bits = (vx_uint8)self->vx_param.accumulator_bits; - param->overflow_policy = self->vx_param.overflow_policy; - param->rounding_policy = self->vx_param.rounding_policy; - param->down_scale_size_rounding = self->vx_param.down_scale_size_rounding; - param->enable_relu = self->vx_param.has_relu; - param->pool_type = 0; - param->pool_size_x = 0; - param->pool_size_y = 0; - param->pad_mode = VX_PAD_CONSTANT; - param->pad_const = pad_const; - - return VSI_SUCCESS; -} /* _set_fc_relu_parameter() */ - -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status; - status = VSI_FAILURE; - - self->n = vxFullyConnectedReluLayer( - self->graph->g, - inputs[0]->t, - inputs[1]->wb, - 0, - 0, - self->vx_param.overflow_policy, - self->vx_param.rounding_policy, - self->vx_param.down_scale_size_rounding, - self->vx_param.has_relu, - outputs[0]->t - ); - - if( NULL != self->n ) - { - status = VSI_SUCCESS; - } - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_bool ret = FALSE; - - /* Check fl and scale*/ - ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); - - if(ret) { - /* check inputs outputs data type */ - /* NN Support */ - BEGIN_IO_TYPE_DECL(FCL_RELU, 3, 1) - /* IO_TYPE(INPUT, WEIGHT, BIAS, OUTPUT) */ - /* NN Support - 
I8 */ - IO_TYPE(D_I8|Q_SYM_PC, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_SYM_PC) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_ASYM) - - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - - /* NN Support - U8 */ - IO_TYPE(D_U8|Q_SYM_PC, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_SYM_PC) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - - /* NN Support - I16 */ - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - - /* NN Support - F16 */ - IO_TYPE(D_F16, D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_F16, D_U8|Q_ASYM) - - /* NN Support - BF16 */ - IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) - - /* NN Support - F32 */ - IO_TYPE(D_F32, D_BF16, D_F32, D_F32) - IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) - /* HW 9.0.1 */ - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - END_IO_TYPE_DECL(FCL_RELU) - ret = VALIDATE_OP_IO_TYPES(FCL_RELU, self, inputs, self->input.num, outputs, self->output.num); - - /* TP Support */ - if (!ret ) { - uint32_t valid_dtypes[] = { - D_F16, D_BF16, D_F32, D_I16|Q_DFP, D_I16|Q_SYM, D_I16|Q_ASYM, D_I8|Q_DFP, D_I8|Q_SYM, - D_I8|Q_ASYM, D_U8|Q_DFP, D_U8|Q_ASYM - }; - - uint32_t weight_type = inputs[1]->attr.dtype.vx_type | inputs[1]->attr.dtype.qnt_type << Q_SHIFT; - uint32_t inputs_types[3] = { 0 }; - vsi_bool supported[3] = { FALSE, FALSE, FALSE }; - int i = 0; - - inputs_types[0] = inputs[0]->attr.dtype.vx_type | inputs[0]->attr.dtype.qnt_type << Q_SHIFT; - inputs_types[2] = outputs[0]->attr.dtype.vx_type | outputs[0]->attr.dtype.qnt_type << Q_SHIFT; - if (inputs[2]) { - switch(inputs[1]->attr.dtype.vx_type) { - case D_F16: - case D_BF16: - case D_F32: - if(inputs[2]->attr.dtype.vx_type == (vsi_nn_type_e)D_F32) { - inputs_types[1] = weight_type; - } - break; - case D_I16: - case D_I8: - case D_U8: - if (inputs[2]->attr.dtype.vx_type == (vsi_nn_type_e)D_I32 || - inputs[2]->attr.dtype.vx_type == (vsi_nn_type_e)D_I64) { - inputs_types[1] = weight_type; - } - break; - default: - break; - } - } else { - inputs_types[1] = weight_type; - } - - for (i = 0; i < 3; i++) { - supported[i] = is_item_in_array(&inputs_types[i], valid_dtypes, - sizeof(uint32_t), _cnt_of_array(valid_dtypes)); - } - - ret = supported[0] && supported[1] && supported[2]; - } - - 
if(!ret) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - vsi_nn_safe_free(desc); - return FALSE; - } - } - return ret; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_bool ret; - vx_nn_convolution_relu_pooling_params_t p; - vx_weights_biases_parameter_optimizations_ext_t opt; - vx_weights_biases_parameter_optimizations_ext_t * p_opt; - -#ifdef VX_CONVERT_POLICY_WRAP_ENABLE - if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) - { - self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; - } -#endif - - ret = vsi_nn_OpSetup( VSI_NN_OP_FCL, self, inputs, outputs ); - - /* Prepare weight_bias */ - if(inputs[1]->wb == NULL) - { - if( _set_fc_relu_parameter( self, &p ) != VSI_SUCCESS ) - { - VSILOGE("set fc_relu weightbias parameter fail\n"); - return FALSE; - } - - p_opt = NULL; - memset( &opt, 0, sizeof( opt ) ); - if( outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC - || inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - opt.inputZeroPoint = inputs[0]->attr.dtype.zero_point; - } - opt.zrl = -1; - opt.outputFormat = outputs[0]->attr.dtype.vx_type; - opt.num_of_input_dims = inputs[0]->attr.dim_num; - opt.num_of_output_dims = outputs[0]->attr.dim_num; - p_opt = &opt; - -#ifdef VSI_40BIT_VA_SUPPORT - { - vx_size size_input0[VSI_NN_MAX_DIM_NUM]; - vx_size size_output0[VSI_NN_MAX_DIM_NUM]; - size_t i = 0; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - size_input0[i] = (vx_size)inputs[0]->attr.size[i]; - size_output0[i] = (vx_size)outputs[0]->attr.size[i]; - } - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors3( - VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER, - size_input0, - size_output0, - size_output0, - &p, - sizeof(p), - (vx_weights_biases_parameter_optimizations_t *)p_opt, - sizeof(opt), - inputs[1]->t, inputs[2]->t - ); - } -#else - { - uint32_t size_u32_input0[VSI_NN_MAX_DIM_NUM]; - uint32_t size_u32_output0[VSI_NN_MAX_DIM_NUM]; - size_t i = 0; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - size_u32_input0[i] = (uint32_t)inputs[0]->attr.size[i]; - size_u32_output0[i] = (uint32_t)outputs[0]->attr.size[i]; - } - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors3( - VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER, - size_u32_input0, - size_u32_output0, - size_u32_output0, - &p, - sizeof(p), - (vx_weights_biases_parameter_optimizations_t *)p_opt, - sizeof(opt), - inputs[1]->t, inputs[2]->t - ); - } -#endif - if( p.pad_const ) - { - vxReleaseScalar( &p.pad_const ); - } - } - - if( NULL == inputs[1]->wb ) - { - VSILOGE( "Create weight bias fail." ); - ret = FALSE; - } - - return ret; -} /* op_setup() */ - #ifdef __cplusplus extern "C" { #endif @@ -350,10 +45,10 @@ DEF_OP_REG ( /* op_name */ FCL_RELU, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, + /* compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, /* optimize */ NULL, /* input_num */ 3, /* output_num */ 1 diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index be32f48..e7d9358 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -164,13 +164,14 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. 
*/ uint32_t i = 0; vsi_nn_gather_param * p = NULL; + uint32_t batch_dims = (uint32_t)self->nn_param.gather.batch_dims; if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { uint32_t j = 0; - uint32_t r_rank = vsi_nn_GetTensorIsScalar(inputs[0]) ? 0 : inputs[0]->attr.dim_num; - uint32_t q_rank = vsi_nn_GetTensorIsScalar(inputs[1]) ? 0 : inputs[1]->attr.dim_num; - uint32_t o_rank = r_rank + q_rank - 1; + uint32_t r_rank = vsi_nn_GetTensorIsScalar(inputs[0]) ? 0 : ((uint32_t)inputs[0]->attr.dim_num - batch_dims); + uint32_t q_rank = vsi_nn_GetTensorIsScalar(inputs[1]) ? 0 : ((uint32_t)inputs[1]->attr.dim_num - batch_dims); + uint32_t o_rank = r_rank + q_rank - 1 + batch_dims; p = &(self->nn_param.gather); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c index 09e96a1..ea22bf7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c @@ -38,86 +38,6 @@ #define _INPUT_NUM (4) #define _OUTPUT_NUM (3) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) - -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_param_t * param = NULL; - - param = vsi_nn_kernel_param_create(); - - vsi_nn_kernel_param_add_float32( param, "height_stride", self->nn_param.generate_proposals.height_stride ); - vsi_nn_kernel_param_add_float32( param, "width_stride", self->nn_param.generate_proposals.width_stride ); - vsi_nn_kernel_param_add_int32( param, "pre_nms_top_n", self->nn_param.generate_proposals.pre_nms_top_n); - vsi_nn_kernel_param_add_int32( param, "post_nms_top_n", self->nn_param.generate_proposals.post_nms_top_n); - vsi_nn_kernel_param_add_float32( param, "iou_threshold", self->nn_param.generate_proposals.iou_threshold ); - vsi_nn_kernel_param_add_float32( param, "min_size", self->nn_param.generate_proposals.min_size ); - - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "cpu beckend conv2d", - inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); - - if( self->n ) - { - status = VSI_SUCCESS; - } - - vsi_nn_kernel_param_release( &param ); - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - VSI_UNREFERENCED(self); - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(outputs); - return TRUE; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) - { - vsi_nn_generate_proposals_param * p; - int32_t num_output_rois; - p = &(self->nn_param.generate_proposals); - num_output_rois = (int32_t)vsi_nn_GetElementNum(inputs[0]); - if(p->pre_nms_top_n > 0) - { - num_output_rois = p->pre_nms_top_n; - } - if(p->post_nms_top_n > 0) - { - num_output_rois = p->post_nms_top_n; - } - - outputs[0]->attr.dim_num = 1; - outputs[0]->attr.size[0] = num_output_rois; - - outputs[1]->attr.dim_num = 2; - outputs[1]->attr.size[0] = 4; - outputs[1]->attr.size[1] = num_output_rois; - - outputs[2]->attr.dim_num = 1; - outputs[2]->attr.size[0] = num_output_rois; - } - return TRUE; -} /* op_setup() */ #ifdef __cplusplus extern "C" { @@ -127,10 +47,10 @@ DEF_OP_REG ( /* op_name */ GENERATE_PROPOSALS, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, + /* 
compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, /* optimize */ NULL, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c index a404979..55f0ef3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c @@ -37,13 +37,17 @@ #include "vsi_nn_internal_node.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_tensor_util_prv.h" #include "vsi_nn_error.h" #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) static vsi_nn_tensor_t * _expand_tensor_dim - ( vsi_nn_graph_t * graph, vsi_nn_tensor_t *tensor, vsi_size_t * shape, vsi_size_t rank, vsi_size_t expand_dim ) + ( vsi_nn_graph_t * graph, vsi_nn_tensor_t *tensor, + vsi_size_t * shape, vsi_size_t rank, vsi_size_t expand_dim, + vsi_nn_opt_direction_e direction, + vsi_bool is_use_reshpe_node) { vsi_size_t new_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_size_t i, cnt; @@ -66,8 +70,14 @@ static vsi_nn_tensor_t * _expand_tensor_dim { new_shape[cnt] = 1; } - - return vsi_nn_reshape_tensor( graph, tensor, new_shape, rank + 1 ); + if (is_use_reshpe_node) + { + return vsi_nn_kernel_insert_reshape_node(graph, tensor, new_shape, (uint32_t)(rank + 1), direction); + } + else + { + return vsi_nn_reshape_tensor(graph, tensor, new_shape, rank + 1); + } } /* _expand_tensor_dim() */ static vsi_status op_compute @@ -127,13 +137,13 @@ static vsi_bool op_setup } p->local->input = _expand_tensor_dim( self->graph, inputs[0], - inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); + inputs[0]->attr.size, inputs[0]->attr.dim_num, 0, VSI_NN_OPTIMIZE_BACKWARD, TRUE); if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC && inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { p->local->weight = _expand_tensor_dim( self->graph, inputs[1], - inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); + inputs[1]->attr.size, inputs[1]->attr.dim_num, 0, VSI_NN_OPTIMIZE_BACKWARD, FALSE); } else { @@ -160,7 +170,7 @@ static vsi_bool op_setup } p->local->output = _expand_tensor_dim( self->graph, outputs[0], - outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); + outputs[0]->attr.size, outputs[0]->attr.dim_num, 0, VSI_NN_OPTIMIZE_FORWARD, TRUE); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_GROUPED_CONV2D, 0, 0); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c index 772af14..72fadad 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c @@ -61,8 +61,10 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { +#define _TENSOR_LEN 64 vsi_bool res; uint32_t i; + char tensor_name[_TENSOR_LEN]; vsi_nn_grouped_conv2d_param *nn_param = &self->nn_param.grouped_conv2d; nn_param->local = (vsi_nn_grouped_conv2d_param_local_data*)malloc( sizeof(vsi_nn_grouped_conv2d_param_local_data)); @@ -197,6 +199,14 @@ static vsi_status op_compute sizeof(vx_nn_convolution_params_ext2_t), LOCAL()->output_tensor_group[i]->t ); + + memset(tensor_name, 0, sizeof(tensor_name)); + snprintf(tensor_name, sizeof(tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, i); + if(vxSetReferenceName((vx_reference)LOCAL()->output_tensor_group[i]->t, tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u copy node output name fail", self->uid); + return 
VSI_FAILURE; + } if( NULL == self->n ) { VSILOGE("Add vxConvolutionLayer fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c index 629486c..31f7abc 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c @@ -36,6 +36,7 @@ #include "vsi_nn_log.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) @@ -85,6 +86,54 @@ static vsi_bool _is_3d_group_norm return FALSE; } /* _is_3d_group_norm() */ +static vsi_nn_tensor_t* _pad_tensor_per_pixel + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_t *input_tensor, + vsi_size_t scale_size_in, + vsi_size_t pad_size_per_pixel + ) +{ + float* f32_in_buffer = NULL; + float* f32_out_buffer = NULL; + vsi_size_t i = 0, j = 0; + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* output_tensor = NULL; + + f32_out_buffer= (float *)malloc(pad_size_per_pixel * scale_size_in * sizeof(float)); + CHECK_PTR_FAIL_GOTO( f32_out_buffer, "Create buffer fail.", final ); + memset(f32_out_buffer, 0, pad_size_per_pixel * scale_size_in * sizeof(float)); + f32_in_buffer = vsi_nn_ConvertTensorToFloat32Data(graph, input_tensor); + if (NULL == f32_in_buffer) + { + output_tensor = NULL; + goto final; + } + + for ( i = 0; i < scale_size_in; i++ ) + { + for (j = 0; j < pad_size_per_pixel; j ++) + { + f32_out_buffer[i * pad_size_per_pixel + j] = f32_in_buffer[i]; + } + } + + memcpy(&attr, &input_tensor->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.size[0] = pad_size_per_pixel; + attr.size[1] = scale_size_in; + attr.dim_num = 2; + output_tensor = vsi_nn_CreateTensorFromData( + graph, + (uint8_t *)f32_out_buffer, + &attr); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create tensor fail.", final ); +final: + vsi_nn_safe_free(f32_in_buffer) + vsi_nn_safe_free(f32_out_buffer) + + return output_tensor; +} + static vsi_status _op_compute ( vsi_nn_node_t * self, @@ -100,6 +149,7 @@ static vsi_status _op_compute vsi_nn_tensor_t * tmp_inputs[3] = {NULL, NULL, NULL}; vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; vsi_nn_groupnorm_lcl_data *local = self->nn_param.groupnorm.lcl_data; + vsi_bool pad_scale_bias = FALSE; status = _try_set_high_presision_tensor(inputs); if (status != VSI_SUCCESS) @@ -123,6 +173,19 @@ static vsi_status _op_compute tmp_inputs[2] = inputs[2]; } + pad_scale_bias = vsi_nn_GetElementNum(inputs[1]) == (vsi_size_t)group_num && + (vsi_size_t)group_num < tmp_inputs[0]->attr.size[2]; + + if (pad_scale_bias) + { + tmp_inputs[1] = _pad_tensor_per_pixel(self->graph, tmp_inputs[1], + group_num, tmp_inputs[0]->attr.size[2] / group_num); + tmp_inputs[2] = _pad_tensor_per_pixel(self->graph, tmp_inputs[2], + group_num, tmp_inputs[0]->attr.size[2] / group_num); + CHECK_PTR_FAIL_GOTO( tmp_inputs[1], "Create tensor fail.", final ); + CHECK_PTR_FAIL_GOTO( tmp_inputs[2], "Create tensor fail.", final ); + } + param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_float32( param, "eps", eps ); vsi_nn_kernel_param_add_int32( param, "group_num", group_num ); @@ -139,6 +202,13 @@ static vsi_status _op_compute vsi_nn_kernel_param_release( &param ); } +final: + if (pad_scale_bias) + { + vsi_safe_release_tensor(tmp_inputs[1]); + vsi_safe_release_tensor(tmp_inputs[2]); + } + return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c 
b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c index cc4b443..e81dd75 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c @@ -39,69 +39,6 @@ #define _INPUT_NUM (2) #define _OUTPUT_NUM (2) -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_FAILURE; - - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, - "heatmap_max_keypoint", - inputs, _INPUT_NUM, - outputs, _OUTPUT_NUM, NULL ); - - if( self->n ) - { - status = VSI_SUCCESS; - } - - return status; -} - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - VSI_UNREFERENCED(self); - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(outputs); - /*TODO: Check tensor shapes. */ - return TRUE; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - VSI_UNREFERENCED(self); - - if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) - { - outputs[0]->attr.dim_num = 2; - outputs[0]->attr.size[0] = inputs[0]->attr.size[0]; - outputs[0]->attr.size[1] = inputs[0]->attr.size[3]; - } - - if ( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) - { - outputs[1]->attr.dim_num = 3; - outputs[1]->attr.size[0] = 2; - outputs[1]->attr.size[1] = inputs[0]->attr.size[0]; - outputs[1]->attr.size[2] = inputs[0]->attr.size[3]; - } - - return TRUE; -} /* op_setup() */ - #ifdef __cplusplus extern "C" { #endif @@ -110,10 +47,10 @@ DEF_OP_REG ( /* op_name */ HEATMAP_MAX_KEYPOINT, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, + /* compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, /* optimize */ NULL, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c index 5386af7..5be9bb1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c @@ -34,91 +34,11 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) -struct _scaletotensor_kernel_params -{ - int32_t ratio[2]; - int32_t offset[2]; - float mean[3]; - float scale; -}; - -typedef struct _scaletotensor_kernel_params scaletotensor_kernel_params_t; - - -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - - VSI_UNREFERENCED(self); - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(outputs); - - return status; -} /* op_compute() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_nn_imageprocess_param * p; - uint32_t i; - p = (vsi_nn_imageprocess_param *)&(self->nn_param.imageprocess); - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) - { - /* TODO */ - if (inputs[0]->attr.dim_num != 4) - { - VSILOGE("Only support 4D tensor for image process!(IMAGEPROCESS)\n"); - return FALSE; - } - if (p->reverse_channel == TRUE && inputs[0]->attr.size[2] != 3) - { - VSILOGE("Only support 3 channels for reverse channel!(IMAGEPROCESS)\n"); - return FALSE; - } - - if 
(p->resize.type != VSI_NN_IMAGEPROCESS_RESIZE_NONE) - { - outputs[0]->attr.dim_num = p->resize.dim_num; - for(i = 0; i < (uint32_t)p->resize.dim_num; ++i) - { - outputs[0]->attr.size[i] = p->resize.length[i]; - } - } - else if (p->crop.enable == TRUE) - { - outputs[0]->attr.dim_num = p->crop.dim_num; - for(i = 0; i < (uint32_t)p->crop.dim_num; ++i) - { - outputs[0]->attr.size[i] = p->crop.length[i]; - } - } - else - { - // CWHN -> WHCN - outputs[0]->attr.size[0] = inputs[0]->attr.size[1]; - outputs[0]->attr.size[1] = inputs[0]->attr.size[2]; - outputs[0]->attr.size[2] = inputs[0]->attr.size[0]; - outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; - } - } - return TRUE; -} /* op_setup() */ - vsi_status vsi_nn_op_imageprocess_single_node ( vsi_nn_graph_t *graph, @@ -150,10 +70,10 @@ DEF_OP_REG ( /* op_name */ IMAGEPROCESS, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, + /* compute */ NULL, + /* deinit */ NULL, /* check */ NULL, - /* setup */ op_setup, + /* setup */ NULL, /* optimize */ NULL, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c index 53c12ae..487e89c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -131,15 +131,15 @@ static vsi_status op_compute vsi_nn_optimize_instance_norm_shape(inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank); - tmp_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], shape, new_rank ); + tmp_tensors[0] = vsi_nn_kernel_insert_reshape_node( self->graph, + inputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_BACKWARD ); tmp_tensors[1] = inputs[1]; tmp_tensors[2] = inputs[2]; - tmp_tensors[3] = vsi_nn_reshape_tensor( self->graph, - outputs[0], shape, new_rank ); + tmp_tensors[3] = vsi_nn_kernel_insert_reshape_node( self->graph, + outputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_FORWARD ); status = _try_set_high_presision_tensor(tmp_tensors); - if(status != VSI_SUCCESS) + if (status != VSI_SUCCESS) { VSILOGE("Set tensor attr of high presision fail"); return status; @@ -150,7 +150,7 @@ static vsi_status op_compute n = vsi_nn_kernel_selector( self->graph, "instance_norm", tmp_tensors, _INPUT_NUM, &tmp_tensors[3], _OUTPUT_NUM, param ); - if( n != NULL ) + if ( n != NULL ) { self->n = (vx_node)n; status = VSI_SUCCESS; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index a90ae59..fe22781 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -55,10 +55,12 @@ static vsi_status op_compute float eps = self->nn_param.layernorm.eps; int32_t axis = self->nn_param.layernorm.axis; +#if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) if ( self->nn_param.layernorm.local->use_internal_node ) { return vsi_nn_internal_compute_node( self ); } +#endif param = vsi_nn_kernel_param_create(); @@ -88,14 +90,18 @@ static vsi_bool op_setup ) { vsi_bool ret = TRUE; + +#if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) int32_t axis = 0; vsi_nn_internal_node_t* curr = NULL; +#endif if ( NULL == self ) { return FALSE; } +#if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) axis = self->nn_param.layernorm.axis; vsi_nn_internal_init_node_wksp( self ); @@ -147,11 +153,14 @@ static vsi_bool op_setup ret = vsi_nn_internal_setup_node( self, curr ); } else +#endif { ret = 
vsi_nn_op_common_setup(self, inputs, outputs); } +#if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) final: +#endif return ret; } @@ -236,9 +245,11 @@ static vsi_status op_init self->nn_param.layernorm.axis = 0; +#if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) self->nn_param.layernorm.local = (vsi_nn_layernorm_lcl_data *)malloc(sizeof(vsi_nn_layernorm_lcl_data)); memset(self->nn_param.layernorm.local, 0x00, sizeof(vsi_nn_layernorm_lcl_data)); self->nn_param.layernorm.local->use_internal_node = FALSE; +#endif return status; } @@ -250,7 +261,9 @@ static vsi_status op_deinit { vsi_nn_safe_free(self->nn_param.layernorm.local); +#if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) vsi_nn_internal_deinit_node_wksp( self ); +#endif vsi_nn_op_common_deinit(self); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c index 34c329c..377ba26 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c @@ -46,14 +46,9 @@ static vsi_status _log_softmax_op_compute ) { vsi_status status; - vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; - vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; - uint32_t rank_in = 0; int32_t axis = 0; - int32_t new_axis = 0; float betaValue = 0; - vsi_bool ret; vsi_nn_kernel_param_t * param = NULL; vsi_nn_log_softmax_param * p = NULL; @@ -69,33 +64,19 @@ static vsi_status _log_softmax_op_compute // TODO: This optimzie is a hack for gpu path, // it should be moved to gpu kernel setup. - ret = vsi_nn_kernel_optimize_softmax_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, - shapes[0], &rank_in, &new_axis); - if( ret ) - { - // Add params - param =vsi_nn_kernel_param_create(); + param =vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32( param, "axis", new_axis ); - vsi_nn_kernel_param_add_float32( param, "beta", betaValue ); + vsi_nn_kernel_param_add_int32( param, "axis", axis ); + vsi_nn_kernel_param_add_float32( param, "beta", betaValue ); - reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], shapes[0], rank_in ); - reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - outputs[0], shapes[0], rank_in ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + kernel_name, + inputs, 1, + outputs, 1, param ); - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, - kernel_name, - &reshape_tensors[0], 1, - &reshape_tensors[1], 1, param ); + vsi_nn_kernel_param_release( &param ); - vsi_nn_ReleaseTensor( &reshape_tensors[0] ); - vsi_nn_ReleaseTensor( &reshape_tensors[1] ); - - vsi_nn_kernel_param_release( &param ); - } if( self->n ) { status = VSI_SUCCESS; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c index ebd17a3..65f22a3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c @@ -224,6 +224,7 @@ static vsi_bool op_setup uint32_t batch_size = 0; uint32_t time_step = 0; uint32_t i = 0; + size_t k = 0; vsi_bool ret = FALSE; vsi_status status = VSI_FAILURE; @@ -329,6 +330,17 @@ static vsi_bool op_setup curr->node->nn_param.lstmunit_ovxlib.forget_bias = curr_param->forget_bias; curr->node->nn_param.lstmunit_ovxlib.proj_clip = curr_param->proj_clip; curr->node->nn_param.lstmunit_ovxlib.recurrent_activation = curr_param->recurrent_activation; + if ( reshape_output->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + reshape_output->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + for (k = 0; k < 
_cnt_of_array( curr_param->internal_dtype ); k++) + { + if (curr_param->internal_dtype[k].vx_type == VSI_NN_TYPE_NONE) + { + curr_param->internal_dtype[k] = reshape_output->attr.dtype; + } + } + } memcpy( curr->node->nn_param.lstmunit_ovxlib.internal_dtype, curr_param->internal_dtype, sizeof( curr_param->internal_dtype ) ); curr->inputs[LSTMUNIT_INPUT_INPUT] = reshape_output; @@ -361,6 +373,21 @@ static vsi_bool op_setup curr->inputs[LSTMUNIT_INPUT_LAYERNORM_C] = inputs[LSTM_INPUT_LAYERNORM_C]; curr->inputs[LSTMUNIT_INPUT_LAYERNORM_O] = inputs[LSTM_INPUT_LAYERNORM_O]; + if (self->input.num > LSTM_INPUT_BIAS_R2I) + { + curr->inputs[LSTMUNIT_INPUT_BIAS_R2I] = inputs[LSTM_INPUT_BIAS_R2I]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2F] = inputs[LSTM_INPUT_BIAS_R2F]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2C] = inputs[LSTM_INPUT_BIAS_R2C]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2O] = inputs[LSTM_INPUT_BIAS_R2O]; + } + else + { + curr->inputs[LSTMUNIT_INPUT_BIAS_R2I] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2F] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2C] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2O] = NULL; + } + curr->outputs[LSTMUNIT_OUTPUT_OUTPUT] = lstmunit_out0; curr->outputs[LSTMUNIT_OUTPUT_H_STATE] = lstmunit_out1; curr->outputs[LSTMUNIT_OUTPUT_C_STATE] = lstmunit_out2; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c index 4bf4443..755c63d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c @@ -303,7 +303,7 @@ static vsi_bool op_setup vsi_nn_internal_tensor_t* input_add_aux_input_fc_outputs[LSTMUNIT_IFCO_GATE_COUNT] = { NULL }; vsi_nn_internal_tensor_t* recurrent_fc_outputs[LSTMUNIT_IFCO_GATE_COUNT] = { NULL }; vsi_nn_internal_tensor_t* layernorm_outputs[LSTMUNIT_IFCO_GATE_COUNT] = { NULL }; - vsi_nn_tensor_t* bias_tensors[LSTMUNIT_IFCO_GATE_COUNT] = { NULL }; + vsi_nn_tensor_t* bias_tensors[LSTMUNIT_IFCO_GATE_COUNT * 2] = { NULL }; vsi_nn_tensor_t* zero_bias_tensor = NULL; vsi_nn_internal_node_t* curr = NULL; int32_t ifco_start_index = 0; @@ -362,7 +362,7 @@ static vsi_bool op_setup setup_op_shapes(self, inputs, outputs); - for( i = 0; i < LSTMUNIT_IFCO_GATE_COUNT; i++) + for( i = 0; i < LSTMUNIT_IFCO_GATE_COUNT * 2; i++) { if( p->local->use_layer_norm || p->local->use_hybrid ) { @@ -370,7 +370,18 @@ static vsi_bool op_setup } else { - bias_tensors[i] = inputs[LSTMUNIT_INPUT_BIAS_I + i]; + if(i < LSTMUNIT_IFCO_GATE_COUNT) + { + bias_tensors[i] = inputs[LSTMUNIT_INPUT_BIAS_I + i]; + } + else if(self->input.num > LSTM_INPUT_BIAS_R2I) + { + bias_tensors[i] = inputs[LSTMUNIT_INPUT_BIAS_R2I + i - LSTMUNIT_IFCO_GATE_COUNT]; + } + else + { + bias_tensors[i] = NULL; + } } } @@ -486,7 +497,7 @@ static vsi_bool op_setup recurrent_fc_outputs[i] = create_tp_fc(self, inputs[LSTMUNIT_INPUT_H_STATE], inputs[LSTMUNIT_INPUT_WEIGHT_R2I + i], - NULL, + bias_tensors[LSTMUNIT_IFCO_GATE_COUNT + i], &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_R2I + i], use_virtual_tensor); CHECK_PTR_FAIL_GOTO( recurrent_fc_outputs[i], "Create tensor fail.", final ); @@ -506,7 +517,7 @@ static vsi_bool op_setup vsi_nn_internal_tensor_t* tmp = create_nn_fc(self, recurrent_input_tensor->t, inputs[LSTMUNIT_INPUT_WEIGHT_R2I + i], - NULL, + bias_tensors[LSTMUNIT_IFCO_GATE_COUNT + i], kernel_h, kernel_w, &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_R2I + i], use_virtual_tensor); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c 
b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c index 146ee33..5e91fbc 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c @@ -37,6 +37,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" +#if !(VX_TRANSPOSE_OPT_SUPPORT) static vsi_bool _is_same_memory_shape ( vsi_nn_node_t * self, @@ -116,6 +117,7 @@ static vsi_bool _is_same_quant return TRUE; } /* _is_same_quant */ +#endif static vsi_status op_compute ( @@ -242,6 +244,14 @@ static vsi_status op_optimize status = VSI_SUCCESS; +#if (VX_TRANSPOSE_OPT_SUPPORT) + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(direction); + self->nn_param.permute.local.initialized = FALSE; + + return status; +#else if (_is_same_memory_shape(self, inputs, outputs) == FALSE || _is_same_quant(self, inputs, outputs) == FALSE || (inputs[0]->t != NULL && outputs[0]->t != NULL)) @@ -285,6 +295,7 @@ static vsi_status op_optimize } return status; +#endif } /* op_optimize() */ #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index dea1770..682628c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -132,7 +132,9 @@ static vsi_bool op_setup p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422 || - p->type == VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422 + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR ) { uint32_t i = 0; @@ -487,6 +489,8 @@ static vsi_bool op_setup break; case VSI_NN_SOURCE_FORMAT_IMAGE_NV21: case VSI_NN_SOURCE_FORMAT_IMAGE_NV12: + case VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR: + case VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_NV12, 0, 0 ); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); @@ -514,10 +518,18 @@ static vsi_bool op_setup { curr->node->nn_param.pre_process_nv12.nv_type = VSI_NN_YUV_TYPE_NV12; } - else + else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV21) { curr->node->nn_param.pre_process_nv12.nv_type = VSI_NN_YUV_TYPE_NV21; } + else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB) + { + curr->node->nn_param.pre_process_nv12.nv_type = VSI_NN_YUV_TYPE_NV12_RGGB; + } + else + { + curr->node->nn_param.pre_process_nv12.nv_type = VSI_NN_YUV_TYPE_NV21_BGGR; + } curr->node->nn_param.pre_process_nv12.reverse_channel = p->reverse_channel; curr->node->nn_param.pre_process_nv12.rect.left = p->rect.left; @@ -618,7 +630,9 @@ static vsi_bool op_setup p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY || (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR && !enable_rgb88_planar_nhwc) || - (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP && !enable_rgb88_planar_nhwc) + (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP && !enable_rgb88_planar_nhwc) || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR ) { if (layout == VSI_NN_DEST_LAYOUT_NHWC) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c index 7fa635a..f02fff9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c @@ -63,7 +63,15 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_nv12.local->enable_perm ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_nv12.local->enable_copy ); vsi_nn_kernel_param_add_int32( param, "nv_type", self->nn_param.pre_process_nv12.nv_type ); - n = vsi_nn_kernel_selector( self->graph, "pre_process_nv12", inputs, 2, outputs, 1, param ); + if (self->nn_param.pre_process_nv12.nv_type == VSI_NN_YUV_TYPE_NV12 || + self->nn_param.pre_process_nv12.nv_type == VSI_NN_YUV_TYPE_NV21) + { + n = vsi_nn_kernel_selector( self->graph, "pre_process_nv12", inputs, 2, outputs, 1, param ); + } + else + { + n = vsi_nn_kernel_selector( self->graph, "pre_process_nv12_rggb", inputs, 2, outputs, 1, param ); + } if( n != NULL ) { self->n = (vx_node)n; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c index c203fdd..095d4d6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c @@ -35,65 +35,8 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status; - status = VSI_FAILURE; - - /* TODO */ - /* example code : add op */ - /* - self->n = vxTensorAddNode( self->graph->g, inputs[0]->t, inputs[1]->t, - VX_CONVERT_POLICY_SATURATE, outputs[0]->t ); - */ - - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(outputs); - - if( NULL != self->n ) - { - status = VSI_SUCCESS; - } - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - VSI_UNREFERENCED(self); - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(outputs); - /*TODO: Check tensor shapes. */ - return TRUE; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - VSI_UNREFERENCED(self); - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(outputs); - - /* TODO: Add code to comput outputs' shape. 
*/ - return TRUE; -} /* op_setup() */ - #ifdef __cplusplus extern "C" { #endif @@ -102,10 +45,10 @@ DEF_OP_REG ( /* op_name */ QUANTIZED_16BIT_LSTM, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, + /* compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, /* optimize */ NULL, /* input_num */ Q16_LSTM_INPUT_CNT, /* output_num */ Q16_LSTM_OUTPUT_CNT diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c index 523eeb4..418c6a0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c @@ -55,7 +55,7 @@ static vsi_status op_compute self->nn_param.reshape.local.initialized == FALSE) { vsi_status status = VSI_SUCCESS; -#ifdef VX_REMOVE_RESHAPE_SUPPORT +#if VX_REMOVE_RESHAPE_SUPPORT vsi_nn_tensor_attr_t attr; vsi_nn_tensor_t *dims_tensor = NULL; vx_nn_reshape_params_t reshape_param; @@ -147,8 +147,11 @@ static vsi_status op_optimize vsi_status status; status = VSI_SUCCESS; -#ifdef VX_REMOVE_RESHAPE_SUPPORT +#if VX_REMOVE_RESHAPE_SUPPORT self->nn_param.reshape.local.initialized = FALSE; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(direction); #else if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c index 4395961..6e1c313 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c @@ -53,7 +53,7 @@ static vsi_status op_compute if (inputs[0]->t != NULL && outputs[0]->t != NULL && self->nn_param.reshape2.local->initialized == FALSE) { -#ifdef VX_REMOVE_RESHAPE_SUPPORT +#if VX_REMOVE_RESHAPE_SUPPORT vsi_nn_tensor_attr_t attr; vsi_nn_tensor_t *dims_tensor = NULL; vx_nn_reshape_params_t reshape_param; @@ -179,8 +179,11 @@ static vsi_status op_optimize vsi_status status; status = VSI_SUCCESS; -#ifdef VX_REMOVE_RESHAPE_SUPPORT +#if VX_REMOVE_RESHAPE_SUPPORT self->nn_param.reshape2.local->initialized = FALSE; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(direction); #else if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c index 1a719af..97fad8f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c @@ -105,6 +105,10 @@ static vsi_status op_compute snprintf(kernel_name, sizeof(kernel_name), "resize_bilinear"); break; + case VSI_NN_INTERPOLATION_CUBIC: + snprintf(kernel_name, sizeof(kernel_name), + "resize_cubic"); + break; default: break; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c index e3e19ad..f6721b6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c @@ -53,6 +53,7 @@ static vsi_status op_compute uint32_t idx_num = 1; vsi_size_t *input_size = inputs[2]->attr.size; uint32_t dims_num = inputs[2]->attr.dim_num; + vsi_nn_reduction_type_e reduction = self->nn_param.scatter_nd_update.reduction; if (inputs[1]->attr.dim_num > 1) { @@ -75,7 +76,17 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "block_size", block_size ); 
vsi_nn_kernel_param_add_int32( param, "coord_dim", coord_dim ); vsi_nn_kernel_param_add_int32( param, "idx_num", idx_num ); - n = vsi_nn_kernel_selector( self->graph, "scatter_nd_update", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + vsi_nn_kernel_param_add_int32( param, "reduction", reduction ); + if (reduction > VSI_NN_REDUCTION_TYPE_NONE) + { + n = vsi_nn_kernel_selector( self->graph, "scatter_nd_update_reduction", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + } + else + { + n = vsi_nn_kernel_selector( self->graph, "scatter_nd_update", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + } if ( n != NULL ) { self->n = (vx_node)n; @@ -155,6 +166,18 @@ static vsi_bool op_setup return TRUE; } /* op_setup() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.scatter_nd_update.reduction = VSI_NN_REDUCTION_TYPE_NONE; + + return status; +} /* op_init() */ + static vsi_status op_deinit ( vsi_nn_node_t * self @@ -172,7 +195,7 @@ extern "C" { DEF_OP_REG ( /* op_name */ SCATTER_ND_UPDATE, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index d6e6e90..84c2dd7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -572,6 +572,10 @@ static vsi_bool _build_strided_slice_params(vsi_nn_strided_slice_param * op_para const int32_t *stride_dims = op_params->stride_dims; strided_slice_param *params = &op_params->lcl2_data->params; + params->begin_dims_num = 0; + params->end_dims_num = 0; + params->stride_dims_num = 0; + begin_mask = _reverse_mask_bits(begin_mask, input_dims); end_mask = _reverse_mask_bits(end_mask, input_dims); shrink_axis_mask = _reverse_mask_bits(shrink_axis_mask, input_dims); @@ -762,7 +766,8 @@ static vsi_status op_optimize vsi_size_t output_elements = 0; /* Only forward run stride_slice's optimize */ - if ( direction == VSI_NN_OPTIMIZE_BACKWARD ) + if ( direction == VSI_NN_OPTIMIZE_BACKWARD || + !self->graph->ctx->options.enable_slice_optimize ) { return status; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c index ff8c0e0..0be22cd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c @@ -111,6 +111,11 @@ static vsi_status op_compute vsi_nn_tensor_t * out1_tensor = NULL; vsi_bool ret = FALSE; + if (inputs[0]->attr.size[axis] == 1) + { + return vsi_nn_internal_compute_node( self ); + } + ret = vsi_nn_kernel_optimize_softmax_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, shapes[0], &rank_in, &new_axis0); @@ -259,13 +264,12 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. 
*/ uint32_t i; + vsi_nn_topk_param * p; + + p = &(self->nn_param.topk); if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - vsi_nn_topk_param * p; - - p = &(self->nn_param.topk); - outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; outputs[0]->attr.size[p->axis] = p->k; for (i = 0; i < inputs[0]->attr.dim_num; i++) @@ -280,10 +284,6 @@ static vsi_bool op_setup if ( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) { - vsi_nn_topk_param * p; - - p = &(self->nn_param.topk); - outputs[1]->attr.dim_num = inputs[0]->attr.dim_num; outputs[1]->attr.size[p->axis] = p->k; for (i = 0; i < inputs[0]->attr.dim_num; i++) @@ -296,9 +296,58 @@ static vsi_bool op_setup } } + if (inputs[0]->attr.size[p->axis] == 1) + { + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* const0_input = NULL; + vsi_nn_tensor_attr_t attr; + + vsi_nn_internal_init_node_wksp(self); + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + + memcpy(&attr, &outputs[1]->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.vtl = FALSE; + attr.is_const = TRUE; + + const0_input = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(const0_input, "Create tensor failed", final); + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = const0_input->t; + curr->outputs[0] = outputs[1]; + vsi_nn_internal_setup_node(self, curr); + } + return TRUE; +final: + return FALSE; } /* op_setup() */ +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + vsi_nn_topk_param * p; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + + p = &(self->nn_param.topk); + if (inputs[0]->attr.size[p->axis] == 1) + { + return vsi_nn_internal_optimize_node( self, direction ); + } + + return VSI_SUCCESS; +} /* op_optimize() */ + static vsi_status op_init ( vsi_nn_node_t * self @@ -310,6 +359,17 @@ static vsi_status op_init return status; } /* op_init() */ +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_internal_deinit_node_wksp(self); + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + #ifdef __cplusplus extern "C" { #endif @@ -319,10 +379,10 @@ DEF_OP_REG /* op_name */ TOPK, /* init */ op_init, /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, + /* deinit */ op_deinit, /* check */ op_check, /* setup */ op_setup, - /* optimize */ NULL, + /* optimize */ op_optimize, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM ); diff --git a/src/tim/vx/internal/src/quantization/vsi_nn_asymmetric_affine.c b/src/tim/vx/internal/src/quantization/vsi_nn_asymmetric_affine.c index 1025604..2088eba 100644 --- a/src/tim/vx/internal/src/quantization/vsi_nn_asymmetric_affine.c +++ b/src/tim/vx/internal/src/quantization/vsi_nn_asymmetric_affine.c @@ -75,10 +75,13 @@ vsi_bool vsi_nn_QuantAffineCheck switch (dtype) { + case VSI_NN_TYPE_UINT4: case VSI_NN_TYPE_UINT8: case VSI_NN_TYPE_UINT16: case VSI_NN_TYPE_UINT32: + case VSI_NN_TYPE_INT4: case VSI_NN_TYPE_INT8: + case VSI_NN_TYPE_INT16: { double product_scale = (double)input->attr.dtype.scale * (double)weight->attr.dtype.scale; const double acuity_round_decimals = 1e-8; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c 
b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index e862b9a..eb02639 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -467,6 +467,8 @@ static _op_param_gen_t s_op_gen[] = /* LPNORM */ NULL, /* RESIZE_3D */ NULL, /* REDUCEL2 */ NULL, + /* CROP_AND_RESIZE */ NULL, + /* TAN */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index 82d1aaa..6f91f99 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -152,6 +152,33 @@ char* vsi_nn_getenv return var; }; +int32_t vsi_nn_getenv_asint + ( + const char* env, + int32_t default_value + ) +{ + int32_t value = default_value; + #ifdef __ANDROID__ + { + char value_str[100]; + int32_t status = __system_property_get(env, value_str); + if (status) { + value = atoi(value_str); + } + } + #else + { + char* env_s = vsi_nn_getenv(env); + if (env_s) { + value = atoi(env_s); + } + } + #endif + + return value; +} + FILE* vsi_nn_fopen ( const char * file_name, diff --git a/src/tim/vx/internal/src/vip/virtual_device.cpp b/src/tim/vx/internal/src/vip/virtual_device.cpp index 3568f69..0635843 100644 --- a/src/tim/vx/internal/src/vip/virtual_device.cpp +++ b/src/tim/vx/internal/src/vip/virtual_device.cpp @@ -227,6 +227,10 @@ uint32_t IDevice::Id() const{ return device_->Id(); } +bool IDevice::GraphSubmit(vsi_nn_graph_t* graph, bool (*func)(const void*), data_t data) { + return device_->GraphSubmit(graph, func_t(func), data); +} + bool IDevice::GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data) { return device_->GraphSubmit(graph, func, data); } diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index 99a5e79..fa58045 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -93,70 +93,53 @@ final: return status; } -int32_t vsi_nn_getEnv(const char* name, char** env_s) { - int32_t ret = 0; - *env_s = vsi_nn_getenv(name); - if (*env_s) { - ret = TRUE; - } - return ret; -} - +#if (defined(__ANDROID__)) && (ANDROID_SDK_VERSION >= 30) +static const char* ENV_ENABLE_SHADER = "vendor.VIV_VX_ENABLE_SHADER"; +static const char* ENV_ENABLE_OPCHECK = "vendor.VSI_NN_ENABLE_OPCHECK"; +static const char* ENV_ENABLE_CONCAT_OPTIMIZE = "vendor.VSI_NN_ENABLE_CONCAT_OPTIMIZE"; +static const char* ENV_ENABLE_I8TOU8 = "vendor.VSI_NN_ENABLE_I8TOU8"; +static const char* ENV_ENABLE_DATACONVERT_OPTIMIZE = "vendor.VSI_NN_ENABLE_DATACONVERT_OPTIMIZE"; +static const char* ENV_ENABLE_STREAM_PROCESSOR = "vendor.VSI_VX_ENABLE_STREAM_PROCESSOR"; +static const char* ENV_FORCE_RGB888_OUT_NHWC = "vendor.VSI_NN_FORCE_RGB888_OUT_NHWC"; +static const char* ENV_ENABLE_SLICE_OPTIMIZE = "vendor.VSI_NN_ENABLE_SLICE_OPTIMIZE"; +static const char* ENV_ENABLE_BATCH_OPT = "vendor.VSI_VX_ENABLE_BATCH_OPT"; +#else +static const char* ENV_ENABLE_SHADER = "VIV_VX_ENABLE_SHADER"; +static const char* ENV_ENABLE_OPCHECK = "VSI_NN_ENABLE_OPCHECK"; +static const char* ENV_ENABLE_CONCAT_OPTIMIZE = "VSI_NN_ENABLE_CONCAT_OPTIMIZE"; +static const char* ENV_ENABLE_I8TOU8 = "VSI_NN_ENABLE_I8TOU8"; +static const char* ENV_ENABLE_DATACONVERT_OPTIMIZE = "VSI_NN_ENABLE_DATACONVERT_OPTIMIZE"; +static const char* ENV_ENABLE_STREAM_PROCESSOR = "VSI_VX_ENABLE_STREAM_PROCESSOR"; +static const char* ENV_FORCE_RGB888_OUT_NHWC = 
"VSI_NN_FORCE_RGB888_OUT_NHWC"; +static const char* ENV_ENABLE_SLICE_OPTIMIZE = "VSI_NN_ENABLE_SLICE_OPTIMIZE"; +static const char* ENV_ENABLE_BATCH_OPT = "VSI_VX_ENABLE_BATCH_OPT"; +#endif static vsi_status vsi_nn_initOptions ( vsi_nn_runtime_option_t *options ) { - char* env_s = NULL; + int32_t default_value = 1; - env_s = NULL; - options->enable_shader = 1; - if (vsi_nn_getEnv("VIV_VX_ENABLE_SHADER", &env_s) && env_s) - { - options->enable_shader = atoi(env_s); - } - - env_s = NULL; - options->enable_opcheck = 1; - if (vsi_nn_getEnv("VSI_NN_ENABLE_OPCHECK", &env_s) && env_s) - { - options->enable_opcheck = atoi(env_s); - } - - env_s = NULL; - options->enable_concat_optimize = 1; - if (vsi_nn_getEnv("VSI_NN_ENABLE_CONCAT_OPTIMIZE", &env_s) && env_s) - { - options->enable_concat_optimize = atoi(env_s); - } - - env_s = NULL; - options->enable_asymi8_to_u8 = 1; - if (vsi_nn_getEnv("VSI_NN_ENABLE_I8TOU8", &env_s) && env_s) - { - options->enable_asymi8_to_u8 = atoi(env_s); - } - - env_s = NULL; - options->enable_dataconvert_optimize = 1; - if (vsi_nn_getEnv("VSI_NN_ENABLE_DATACONVERT_OPTIMIZE", &env_s) && env_s) - { - options->enable_dataconvert_optimize = atoi(env_s); - } - - env_s = NULL; - options->enable_stream_processor = 1; - if (vsi_nn_getEnv("VSI_VX_ENABLE_STREAM_PROCESSOR", &env_s) && env_s) - { - options->enable_stream_processor = atoi(env_s); - } - - env_s = NULL; - options->enable_rgb88_planar_nhwc = 0; - if (vsi_nn_getEnv("VSI_NN_FORCE_RGB888_OUT_NHWC", &env_s) && env_s) - { - options->enable_rgb88_planar_nhwc = atoi(env_s); - } + options->enable_shader = vsi_nn_getenv_asint(ENV_ENABLE_SHADER, 1); + options->enable_opcheck = vsi_nn_getenv_asint(ENV_ENABLE_OPCHECK, 1); +#if (VX_CONCAT_OPT_SUPPORT) + default_value = 0; +#else + default_value = 1; +#endif + options->enable_concat_optimize = vsi_nn_getenv_asint(ENV_ENABLE_CONCAT_OPTIMIZE, default_value); + options->enable_asymi8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1); + options->enable_dataconvert_optimize = vsi_nn_getenv_asint(ENV_ENABLE_DATACONVERT_OPTIMIZE, 1); + options->enable_stream_processor = vsi_nn_getenv_asint(ENV_ENABLE_STREAM_PROCESSOR, 1); + options->enable_rgb88_planar_nhwc = vsi_nn_getenv_asint(ENV_FORCE_RGB888_OUT_NHWC, 0); +#if (VX_STRIDED_SLICE_OPT_SUPPORT) + default_value = 0; +#else + default_value = 1; +#endif + options->enable_slice_optimize = vsi_nn_getenv_asint(ENV_ENABLE_SLICE_OPTIMIZE, default_value); + options->enable_batch_opt = vsi_nn_getenv_asint(ENV_ENABLE_BATCH_OPT, 0); return VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index 9954d5d..ded1835 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -560,6 +560,692 @@ final: return status; } /* setup_node() */ +#if VX_GRAPH_BATCH_OPT_SUPPORT +static vsi_bool canBatchSplit +( + vsi_nn_node_t* node, + uint32_t inputBtachNum +) +{ + vsi_bool ret; + uint32_t i; + ret = TRUE; + + switch(node->op) + { + case VSI_NN_OP_SOFTMAX: + if (node->nn_param.softmax.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_LOG_SOFTMAX: + if (node->nn_param.log_softmax.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_LAYER_NORM: + if (node->nn_param.layernorm.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_REDUCE: + for (i = 0; i < node->nn_param.reduce.axis_num; i++) + { + int index = node->nn_param.reduce.axis[i]; + if (index == (int32_t)inputBtachNum - 
1) + { + ret = FALSE; + break; + } + } + break; + case VSI_NN_OP_CONCAT: + if (node->nn_param.concat.axis == inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_TENSORSTACKCONCAT: + if (node->nn_param.tensorstackconcat.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_STACK: + if (node->nn_param.stack.axis == inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_UNSTACK: + if (node->nn_param.unstack.axis == inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_CONCATSHIFT: + if (node->nn_param.concatshift.axis == inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_SPLIT: + if (node->nn_param.split.axis == inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_BATCH2SPACE: + case VSI_NN_OP_SPACE2BATCH: + case VSI_NN_OP_BATCH_NORM: + ret = FALSE; + break; + case VSI_NN_OP_CROP: + if (node->nn_param.crop.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_CUMSUM: + if (node->nn_param.cumsum.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_INSTANCE_NORM: + for (i = 0; i < (uint32_t)node->nn_param.instancenorm.axis_num; i++) + { + int index = node->nn_param.instancenorm.axis[i]; + if (index == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + break; + } + } + break; + case VSI_NN_OP_L2NORMALIZESCALE: + if (node->nn_param.l2normalizescale.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_L2_NORMALIZE: + if (node->nn_param.l2_normalize.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_LPNORM: + if (node->nn_param.lpnorm.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_LRN: + if (node->nn_param.lrn.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_MOMENTS: + for (i = 0; i < (uint32_t)node->nn_param.moments.axis_num; i++) + { + int index = node->nn_param.moments.axis[i]; + if (index == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + break; + } + } + break; + case VSI_NN_OP_REPEAT: + if (node->nn_param.repeat.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_GATHER: + if (node->nn_param.gather.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_GATHER_ELEMENTS: + if (node->nn_param.gather_elements.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_SCATTER_ELEMENTS: + if (node->nn_param.scatter_elements.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_SHUFFLECHANNEL: + if (node->nn_param.shufflechannel.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_TOPK: + if (node->nn_param.topk.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + default: + break; + } + + return ret; +} + +static vsi_status batchInference_graph +( + vsi_nn_graph_t* graph, + vsi_nn_node_id_t* nodes_list +) +{ + vsi_size_t i, j, k; + vsi_status status; + vsi_bool ret; + vsi_nn_tensor_t* tensor = NULL; + vsi_nn_tensor_t** inputs = NULL; + vsi_nn_tensor_t** outputs = NULL; + vsi_nn_tensor_attr_t* original_inputs_attr = NULL; + vsi_nn_tensor_attr_t* original_outputs_attr = NULL; + vsi_nn_tensor_id_t* approximateConstTensor = NULL; + vsi_size_t approximateConstTensor_count = 0; + vsi_bool has_inputTensor = FALSE; + vsi_nn_node_id_t node_id; + vsi_nn_node_t* node; + vsi_size_t num_of_node_inputs = 0; + vsi_size_t batchCount = 0; + 
vsi_size_t batchNum = 1; + + vx_hardware_caps_params_t hw_param; + vx_context ctx = vxGetContext((vx_reference)graph->g); + + for (i = 0; i < graph->node_num; i++) + { + node_id = nodes_list[i]; + node = vsi_nn_GetNode(graph, node_id); + /* For NBG node, donot infer shape*/ + if (node && node->op == VSI_NN_OP_NBG) + { + status = VSI_SUCCESS; + goto final; + } + } + + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t)); + status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t)); + + /*initial tensor shape*/ + status = setup_node(graph, nodes_list); + if (VSI_SUCCESS != status) + { + goto final; + } + + status = VSI_SUCCESS; + ret = TRUE; + inputs = allocate_io_buffer(graph); + outputs = allocate_io_buffer(graph); + original_inputs_attr = (vsi_nn_tensor_attr_t*)malloc(sizeof(vsi_nn_tensor_attr_t) * graph->max_node_io); + original_outputs_attr = (vsi_nn_tensor_attr_t*)malloc(sizeof(vsi_nn_tensor_attr_t) * graph->max_node_io); + approximateConstTensor = (vsi_nn_tensor_id_t*)malloc(sizeof(vsi_nn_tensor_id_t) * graph->tensor_num); + memset(approximateConstTensor, -1, sizeof(vsi_nn_tensor_id_t) * graph->tensor_num); + + if (NULL == inputs || NULL == outputs || NULL == original_inputs_attr || NULL == original_outputs_attr) + { + VSILOGE("allocate buffer fail"); + status = VSI_FAILURE; + goto final; + } + + for (i = 0; i < graph->node_num; i++) + { + node_id = nodes_list[i]; + memset(inputs, 0, graph->max_node_io * sizeof(vsi_nn_tensor_t*)); + memset(outputs, 0, graph->max_node_io * sizeof(vsi_nn_tensor_t*)); + memset(original_inputs_attr, 0, graph->max_node_io * sizeof(vsi_nn_tensor_attr_t)); + memset(original_outputs_attr, 0, graph->max_node_io * sizeof(vsi_nn_tensor_attr_t)); + + /* Get inputs, outputs. */ + node = vsi_nn_GetNode(graph, node_id); + CHECK_PTR_FAIL_GOTO(node, "Get node fail.", final); + + vsi_nn_GetTensors(graph, node->input.tensors, + node->input.num, inputs); + vsi_nn_GetTensors(graph, node->output.tensors, + node->output.num, outputs); + batchNum = 1; + /*get input batch number*/ + has_inputTensor = FALSE; + for (j = 0; j < node->input.num; j++) + { + vx_bool is_const = FALSE; + if (inputs[j] == NULL) + { + continue; + } + memcpy(&original_inputs_attr[j], &inputs[j]->attr, sizeof(vsi_nn_tensor_attr_t)); + for (k = 0; k < approximateConstTensor_count; k++) + { + if (node->input.tensors[j] == approximateConstTensor[k]) + { + is_const = TRUE; + } + } + if (inputs[j]->attr.is_const != TRUE && is_const != TRUE) + { + has_inputTensor = TRUE; + if (batchNum < inputs[j]->attr.size[inputs[j]->attr.dim_num - 1]) + { + batchNum = inputs[j]->attr.size[inputs[j]->attr.dim_num - 1]; + } + } + } + + for (j = 0; j < node->output.num; j++) + { + if (outputs[j] == NULL) + { + continue; + } + memcpy(&original_outputs_attr[j], &outputs[j]->attr, sizeof(vsi_nn_tensor_attr_t)); + if (!has_inputTensor) + { + approximateConstTensor[approximateConstTensor_count++] = node->output.tensors[j]; + } + if (original_outputs_attr[j].dim_num < 1) + { + break; + } + } + if (j != node->output.num) + { + continue; + } + + if (batchNum > 1 && canBatchSplit(node, original_inputs_attr[0].dim_num)) + { + vsi_size_t iterator_list_index = 0; + vsi_size_t list_index = 0; + vsi_size_t* iterator_list = (vsi_size_t*)malloc(sizeof(vsi_size_t) * (batchNum + 1)); + memset(iterator_list, 0, sizeof(uint32_t) * (batchNum + 1)); + + if (((vsi_nn_node_prv_t*)node)->split_num > 0) + {/*user defined batch count*/ + iterator_list[iterator_list_index++] = ((vsi_nn_node_prv_t*)node)->split_num; + if 
(((vsi_nn_node_prv_t*)node)->split_num == 1) + {/*if user set split_num = 1, there is no need to batch split.*/ + continue; + } + } + /*iterate through each vaild batch count*/ + for (batchCount = batchNum; batchCount > 1; batchCount--) + { + + /*for some node with big batch num, should limit to max core count.*/ + if (batchCount > (hw_param.coreCount == 0?24 : hw_param.coreCount)) + { + continue; + } + if (batchNum % batchCount != 0) + { + continue; + } + iterator_list[iterator_list_index++] = batchCount; + } + + /*iterate through each vaild batch count*/ + for (list_index = 0; list_index < iterator_list_index; list_index++) + { + batchCount = iterator_list[list_index]; + + /*set node input batch*/ + num_of_node_inputs = node->input.num; + for (k = 0; k < num_of_node_inputs; k++) + { + tensor = inputs[k]; + if (tensor) + { + vx_bool is_const = FALSE; + uint32_t index = 0; + for (index = 0; index < approximateConstTensor_count; index++) + { + if (node->input.tensors[k] == approximateConstTensor[index]) + { + is_const = TRUE; + } + } + if (is_const != TRUE && tensor->attr.is_const != TRUE) + { + if (original_inputs_attr[k].size[tensor->attr.dim_num - 1] / batchCount < 1 + || original_inputs_attr[k].size[tensor->attr.dim_num - 1] % batchCount != 0) + { + break; + } + else + { + tensor->attr.size[tensor->attr.dim_num - 1] = + original_inputs_attr[k].size[tensor->attr.dim_num - 1] / batchCount; + } + } + } + } + if (k != num_of_node_inputs) + { + continue; + } + + /*reset output tensor size, dim_num and other parameter, + if not, it will affect vsi_nn_OpGenerateTensor*/ + for (j = 0; j < node->output.num; j++) + { + if (outputs[j] == NULL) + { + continue; + } + outputs[j]->attr.dim_num = VSI_NN_DIM_AUTO; + for (k = 0; k < VSI_NN_MAX_DIM_NUM; k++) + { + outputs[j]->attr.size[k] = 0; + } + } + if (node->internal_node_wksp != NULL) + { + vsi_nn_internal_init_node_wksp(node); + } + + /*node shape inference: */ + if (vsi_nn_OpCheck(node->op, node, inputs, outputs)) + { + vsi_nn_print_node_io(graph, node, 0x01); + ret = vsi_nn_OpGenerateTensor(node, inputs, outputs); + if (ret != TRUE) + { + VSILOGD("Cannot split node[%u] %s on input_batch_count=%u", + node_id, vsi_nn_OpGetName(node->op), batchCount); + continue; + } + vsi_nn_print_node_io(graph, node, 0x02); + + /*check if the node can be splited on batch*/ + for (j = 0; j < node->output.num; j++) + { + if (outputs[j] == NULL) + { + continue; + } + + tensor = outputs[j]; + /*can be splited if the batch dim size of the output shape is changed.*/ + if (tensor->attr.size[tensor->attr.dim_num - 1] == + original_outputs_attr[j].size[original_outputs_attr[j].dim_num - 1]) + { + VSILOGD("Cannot split node[%u] %s on input_batch_count=%u", + node_id, + vsi_nn_OpGetName(node->op), + batchCount); + break; + } + } + + if (j == node->output.num ) + { + /*save the verified batch count*/ + ((vsi_nn_node_prv_t*)node)->split_num = batchCount; + break; + } + } + else + { + VSILOGD("Cannot split node[%u] %s on input_batch_count=%u", + node_id, + vsi_nn_OpGetName(node->op), + batchCount); + continue; + } + } + + /*restore node input batch number*/ + num_of_node_inputs = node->input.num; + for (k = 0; k < num_of_node_inputs; k++) + { + tensor = inputs[k]; + if (tensor) + { + tensor->attr.size[tensor->attr.dim_num - 1] = + original_inputs_attr[k].size[tensor->attr.dim_num - 1] ; + } + } + + /*reset the output tensors*/ + for (j = 0; j < node->output.num; j++) + { + if (outputs[j] == NULL) + { + continue; + } + outputs[j]->attr.dim_num = VSI_NN_DIM_AUTO; + for (k = 0; k 
< VSI_NN_MAX_DIM_NUM; k++) + { + outputs[j]->attr.size[k] = 0; + } + } + if (node->internal_node_wksp != NULL) + { + vsi_nn_internal_init_node_wksp(node); + } + + /*restore node output shape*/ + if (vsi_nn_OpCheck(node->op, node, inputs, outputs)) + { + ret = vsi_nn_OpGenerateTensor(node, inputs, outputs); + } + } + } + + final: + for (i = 0; i < graph->node_num; i++) + { + node_id = nodes_list[i]; + node = vsi_nn_GetNode(graph, node_id); + if (node == NULL || node->op == VSI_NN_OP_NBG) + { + break; + } + + vsi_nn_GetTensors(graph, node->input.tensors, + node->input.num, inputs); + vsi_nn_GetTensors(graph, node->output.tensors, + node->output.num, outputs); + for (j = 0; j < node->output.num; j++) + { + if (outputs[j] == NULL) + { + continue; + } + /*reset attr->size*/ + outputs[j]->attr.dim_num = VSI_NN_DIM_AUTO; + for (k = 0; k < VSI_NN_MAX_DIM_NUM; k++) + { + outputs[j]->attr.size[k] = 0; + } + } + if (node->internal_node_wksp != NULL) + { + vsi_nn_internal_init_node_wksp(node); + } + } + + free_io_buffer(inputs); + free_io_buffer(outputs); + + if (original_inputs_attr != NULL) + { + free(original_inputs_attr); + } + if (original_outputs_attr != NULL) + { + free(original_outputs_attr); + } + if (approximateConstTensor != NULL) + { + free(approximateConstTensor); + } + + return status; +} /* batchInference_graph() */ + +static vsi_status update_vxnode_batchNum +( + vsi_nn_graph_t* graph, + vsi_nn_node_id_t* node_list +) +{ + uint32_t i, j; + vsi_status status; + vsi_nn_node_id_t node_id; + vsi_nn_node_t* node; + vsi_nn_internal_node_t* inode; + + status = VSI_SUCCESS; + for (i = 0; i < graph->node_num; i++) + { + node_id = node_list[i]; + node = vsi_nn_GetNode(graph, node_id); + CHECK_PTR_FAIL_GOTO(node, "Get node fail.", final); + if (node->n != NULL) + { + vxSetNodeBatch(node->n, (uint32_t)((vsi_nn_node_prv_t*)node)->split_num); + if (((vsi_nn_node_prv_t*)node)->split_num > 1) + { + VSILOGD("split node[%u] %s to %ds on batch dim", + node_id, + vsi_nn_OpGetName(node->op), + ((vsi_nn_node_prv_t*)node)->split_num); + } + } + + for (j = 1; j < 100; j++) + { + inode = vsi_nn_internal_get_node_by_uid(node, j); + if (inode == NULL) + { + break; + } + else + { + if (inode->node->n != NULL) + { + vxSetNodeBatch(inode->node->n, (uint32_t)((vsi_nn_node_prv_t*)node)->split_num); + } + } + } + + } + + final: + return status; +} /* update_vxnode_batchNum() */ +#endif + +vsi_status vsi_nn_InferShape +( + vsi_nn_graph_t* graph +) +{ + uint32_t i, j, k; + vsi_status status; + vsi_nn_tensor_t** outputs = NULL; + vsi_nn_node_t* node; + vsi_nn_node_id_t* nodes_list = NULL; + status = VSI_SUCCESS; + + for (i = 0; i < graph->node_num; i++) + { + node = vsi_nn_GetNode(graph, i); + /* For NBG node, donot infer shape*/ + if (node && node->op == VSI_NN_OP_NBG) + { + status = VSI_FAILURE; + goto final; + } + } + + outputs = allocate_io_buffer(graph); + if (NULL == outputs) + { + VSILOGE("allocate buffer fail"); + status = VSI_FAILURE; + goto final; + } + + /*reset all nodes' output shape*/ + for (i = 0; i < graph->node_num; i++) + { + memset(outputs, 0, graph->max_node_io * sizeof(vsi_nn_tensor_t*)); + node = vsi_nn_GetNode(graph, i); + CHECK_PTR_FAIL_GOTO(node, "Get node fail.", final); + + vsi_nn_GetTensors(graph, node->output.tensors, + node->output.num, outputs); + CHECK_PTR_FAIL_GOTO(outputs, "Get node's output fail.", final); + for (j = 0; j < node->output.num; j++) + { + if (outputs[j] == NULL) + { + continue; + } + /*reset attr->size*/ + outputs[j]->attr.dim_num = VSI_NN_DIM_AUTO; + for (k = 0; k < 
VSI_NN_MAX_DIM_NUM; k++) + { + outputs[j]->attr.size[k] = 0; + } + } + if (node->internal_node_wksp != NULL) + { + vsi_nn_internal_init_node_wksp(node); + } + } + + /*setup nodes.*/ + nodes_list = (vsi_nn_node_id_t*)malloc( + graph->node_num * sizeof(vsi_nn_node_id_t)); + if (!nodes_list) + { + goto final; + } + for (i = 0; i < graph->node_num; i++) + { + nodes_list[i] = i; + } + + status = setup_node(graph, nodes_list); + if (VSI_SUCCESS != status) + { + goto final; + } + + final: + free_io_buffer(outputs); + if (NULL != nodes_list) + { + free(nodes_list); + } + + return status; +} + static vsi_status set_graph_precision ( vsi_nn_graph_t * graph, @@ -809,6 +1495,18 @@ vsi_status vsi_nn_SetupGraph goto final; } +#if VX_GRAPH_BATCH_OPT_SUPPORT + if (graph->ctx->options.enable_batch_opt) + { + /*processing batch splitting*/ + status = batchInference_graph(graph, nodes_list); + if (VSI_SUCCESS != status) + { + goto final; + } + } +#endif + /* Preprocess node and tensor */ status = setup_node( graph, nodes_list ); if(VSI_SUCCESS != status) @@ -838,6 +1536,14 @@ vsi_status vsi_nn_SetupGraph goto final; } +#if VX_GRAPH_BATCH_OPT_SUPPORT + /* update vxnode's batch_count */ + status = update_vxnode_batchNum(graph, nodes_list); + if (VSI_SUCCESS != status) + { + goto final; + } +#endif /* set precision again to make sure any tensor created by compute_node have correct precesion infor*/ status = set_graph_precision(graph, nodes_list); if(VSI_SUCCESS != status) @@ -1011,7 +1717,8 @@ static vsi_nn_tensor_id_t _add_tensor vsi_nn_graph_t * graph, vsi_nn_tensor_id_t id, vsi_nn_tensor_attr_t * attr, - uint8_t * data + uint8_t * data, + int8_t is_from_axisram ) { vsi_nn_tensor_t * tensor; @@ -1043,11 +1750,26 @@ static vsi_nn_tensor_id_t _add_tensor } else if( NULL != data ) { - tensor = vsi_nn_CreateTensorFromData( graph, data, attr ); + if (TRUE == is_from_axisram) + { + VSILOGE("Can't create a tensor from AXI-SRAM with data."); + } + else + { + tensor = vsi_nn_CreateTensorFromData( graph, data, attr ); + } } else { - tensor = vsi_nn_CreateTensor( graph, attr ); + if (TRUE == is_from_axisram) + { + tensor = vsi_nn_CreateTensorFromAXISRAM(graph, attr); + } + else + { + tensor = vsi_nn_CreateTensor(graph, attr); + } + } if( NULL != tensor ) @@ -1071,7 +1793,7 @@ vsi_nn_tensor_id_t vsi_nn_AddTensor ) { attr->is_created_from_handle = FALSE; - return _add_tensor(graph, id, attr, data); + return _add_tensor(graph, id, attr, data, FALSE); } /* vsi_nn_AddTensor() */ vsi_nn_tensor_id_t vsi_nn_AddTensorFromHandle @@ -1083,7 +1805,7 @@ vsi_nn_tensor_id_t vsi_nn_AddTensorFromHandle ) { attr->is_created_from_handle = TRUE; - return _add_tensor(graph, id, attr, data); + return _add_tensor(graph, id, attr, data, FALSE); } vsi_nn_tensor_id_t vsi_nn_AddTensorFromView @@ -1116,7 +1838,7 @@ vsi_nn_tensor_id_t vsi_nn_AddTensorFromView { attr.size[i] = end[i] - start[i]; } - id = _add_tensor(graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL); + id = _add_tensor(graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL, FALSE); if (VSI_NN_TENSOR_ID_NA == id) { VSILOGE("Create view tensor failed, new tensor could not be created."); @@ -1150,6 +1872,16 @@ final: return id; } +vsi_nn_tensor_id_t vsi_nn_AddTensorFromAXISRAM + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t id, + vsi_nn_tensor_attr_t * attr + ) +{ + return _add_tensor(graph, id, attr, NULL, TRUE); +} /* vsi_nn_AddTensorFromAXISRAM() */ + vsi_nn_tensor_id_t vsi_nn_AttachTensorToGraph ( vsi_nn_graph_t * graph, diff --git a/src/tim/vx/internal/src/vsi_nn_log.c 
b/src/tim/vx/internal/src/vsi_nn_log.c index 25d421b..f617359 100644 --- a/src/tim/vx/internal/src/vsi_nn_log.c +++ b/src/tim/vx/internal/src/vsi_nn_log.c @@ -29,37 +29,11 @@ #include "vsi_nn_log.h" #include "vsi_nn_types.h" -#ifdef __ANDROID__ -#if ANDROID_SDK_VERSION >= 30 +#if (defined(__ANDROID__)) && (ANDROID_SDK_VERSION >= 30) static const char* ENV_LOG_LEVEL = "vendor.VSI_NN_LOG_LEVEL"; #else static const char* ENV_LOG_LEVEL = "VSI_NN_LOG_LEVEL"; #endif -#else -static const char* ENV_LOG_LEVEL = "VSI_NN_LOG_LEVEL"; -#endif - -int get_env_as_int(const char* env, int default_value) { - int value = default_value; - #ifdef __ANDROID__ - { - char value_str[100]; - int status = __system_property_get(env, value_str); - if (status) { - value = atoi(value_str); - } - } - #else - { - char* env_s = vsi_nn_getenv(env); - if (env_s) { - value = atoi(env_s); - } - } - #endif - - return value; -} static vsi_bool _check_log_level ( @@ -70,7 +44,7 @@ static vsi_bool _check_log_level if(env_level == VSI_NN_LOG_UNINIT) { - env_level = (vsi_nn_log_level_e)get_env_as_int(ENV_LOG_LEVEL, VSI_NN_LOG_WARN); + env_level = (vsi_nn_log_level_e)vsi_nn_getenv_asint(ENV_LOG_LEVEL, VSI_NN_LOG_WARN); } if(env_level >= level) diff --git a/src/tim/vx/internal/src/vsi_nn_node.c b/src/tim/vx/internal/src/vsi_nn_node.c index 641888e..a284a27 100644 --- a/src/tim/vx/internal/src/vsi_nn_node.c +++ b/src/tim/vx/internal/src/vsi_nn_node.c @@ -212,6 +212,7 @@ void vsi_nn_PrintNode } count += temp; } + count --; temp = snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count, "%s", " ], [out:" ); if ( temp >= _MAX_PRINT_BUF_SZ - count || temp == -1 ) @@ -224,7 +225,7 @@ void vsi_nn_PrintNode { /* -3 means reserve memory for ending symbols --" ]" */ temp = snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count - 3, - " %d,", node->input.tensors[i] ); + " %d,", node->output.tensors[i] ); if ( temp >= _MAX_PRINT_BUF_SZ - count - 3 || temp == -1 ) { is_out_of_bound = TRUE; @@ -232,6 +233,7 @@ void vsi_nn_PrintNode } count += temp; } + count --; count += snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count, "%s", " ]" ); final: @@ -243,6 +245,26 @@ final: VSILOGI( "(%16s)node[%u] %s [%08x]", vsi_nn_OpGetName(node->op), id, buf, node->n ); } /* vsi_nn_PrintNode() */ +#if VX_GRAPH_BATCH_OPT_SUPPORT +vsi_status vsi_nn_SetNodeBatchSplitNum +( + vsi_nn_node_t* node, + int8_t split_num +) +{ + vsi_status status = VSI_SUCCESS; + if (node == NULL || split_num < 1) + { + status = VSI_FAILURE; + goto final; + } + ((vsi_nn_node_prv_t*)node)->split_num = split_num; + + final: + return status; +} +#endif + vsi_status vsi_nn_update_node_attr ( vsi_nn_node_t *node diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c index ca565da..4a9caea 100644 --- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c +++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c @@ -207,6 +207,7 @@ static _node_template s_template[] = /* LPNORM */ NULL, /* RESIZE_3D */ NULL, /* REDUCEL2 */ NULL, + /* CROP_AND_RESIZE */ NULL, }; //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c ); diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index 3a9ac63..c6e9daa 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -89,7 +89,9 @@ static void _create_multi_norm_tensors multi_input_tensors[2] = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &uv_input_attr, 
NULL); } else if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || - *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21) + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21 || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR) { uv_input_attr = *input_attr; uv_input_attr.size[0] = w; @@ -445,7 +447,9 @@ static void _get_org_graph_inputs i += 2 ; } else if(nodes[0]->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || - nodes[0]->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_NV21 ) + nodes[0]->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_NV21 || + nodes[0]->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB || + nodes[0]->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR) { i += 1; } @@ -558,7 +562,9 @@ vsi_status vsi_nn_add_single_preproc_node node_input_num = 3; } else if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || - *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21) + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21 || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR) { node_input_num = 2; } @@ -607,7 +613,9 @@ vsi_status vsi_nn_add_single_preproc_node *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21 || *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444 || - *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP) + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR) { _create_multi_norm_tensors(graph, &input_attr, source_layout, source_format, preproc_inputs); } diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index a333d42..d44ecf8 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -66,7 +66,8 @@ static vsi_nn_tensor_t * _create_tensor ( vsi_nn_graph_t * graph, uint8_t * data, - vsi_nn_tensor_attr_t * attr + vsi_nn_tensor_attr_t * attr, + int8_t is_from_axisram ); static vsi_size_t get_tensor_elements_num @@ -568,6 +569,16 @@ static vsi_bool _init_tensor { tensor->t = vxCreateTensor2( graph->ctx->c, ¶ms, sizeof( vx_tensor_create_params_t ) ); +#ifdef VSI_CREATE_TENSOR_FROM_AXISRAM_SUPPORT + if (TRUE == _get_tensor_is_from_axisram((vsi_nn_tensor_prv_t*)tensor)) + { + vx_enum pool_type = VX_VIV_MEM_POOL_TYPE_AXI_SRAM; + vxSetTensorAttribute(tensor->t, + VX_TENSOR_MEMORY_POOL_TYPE, + &pool_type, + sizeof(vx_enum)); + } +#endif } else { @@ -596,16 +607,21 @@ static vsi_bool _init_tensor if( !tensor->attr.vtl && !tensor->attr.is_const ) { //norm tensor need to fill initial value - if( ( !tensor->attr.is_created_from_handle ) || tensor->attr.is_handle_malloc_by_ovxlib ) +#ifdef VSI_CREATE_TENSOR_FROM_AXISRAM_SUPPORT + if (TRUE != _get_tensor_is_from_axisram((vsi_nn_tensor_prv_t*)tensor)) +#endif { - vsi_nn_FillTensorWithValue( graph, tensor, 0.0f ); - if(tensor->attr.is_created_from_handle) + if( ( !tensor->attr.is_created_from_handle ) || tensor->attr.is_handle_malloc_by_ovxlib) { - vsi_status status = vxFlushHandle( (vx_reference)tensor->t ); - if (VSI_SUCCESS != status) + vsi_nn_FillTensorWithValue( graph, tensor, 0.0f ); + if(tensor->attr.is_created_from_handle) { - ret = FALSE; - goto final; + vsi_status status = vxFlushHandle( (vx_reference)tensor->t ); + if (VSI_SUCCESS != status) + { + ret 
= FALSE; + goto final; + } } } } @@ -654,7 +670,8 @@ static vsi_nn_tensor_t * _create_tensor ( vsi_nn_graph_t * graph, uint8_t * data, - vsi_nn_tensor_attr_t * attr + vsi_nn_tensor_attr_t * attr, + int8_t is_from_axisram ) { vsi_nn_tensor_prv_t * tensor; @@ -673,6 +690,10 @@ static vsi_nn_tensor_t * _create_tensor memset( tensor, 0, sizeof( vsi_nn_tensor_prv_t ) ); memcpy( &tensor->pot.attr, attr, sizeof( vsi_nn_tensor_attr_t ) ); tensor->pot.is_swapped = FALSE; + if (TRUE == is_from_axisram) + { + tensor->is_from_axisram = is_from_axisram; + } if( attr->dim_num != VSI_NN_DIM_AUTO ) { _init_tensor( graph, &tensor->pot, data); @@ -694,7 +715,7 @@ vsi_nn_tensor_t * vsi_nn_CreateTensor ) { attr->is_created_from_handle = FALSE; - return _create_tensor(graph, NULL, attr); + return _create_tensor(graph, NULL, attr, FALSE); } /* vsi_nn_CreateTensor() */ vsi_nn_tensor_t * vsi_nn_CreateTensorFromHandle @@ -727,7 +748,7 @@ vsi_nn_tensor_t * vsi_nn_CreateTensorFromHandle } else { - ptensor = _create_tensor(graph, data, attr); + ptensor = _create_tensor(graph, data, attr, FALSE); } final: @@ -3115,6 +3136,39 @@ vsi_status _set_tensor_is_scalar return status; } +int8_t _get_tensor_is_from_axisram + ( + vsi_nn_tensor_prv_t* tensor + ) +{ + int8_t is_from_axisram = FALSE; + if (NULL == tensor) { + VSILOGE("To get is_scalar, tensor pointer SHOULD NOT be NULL."); + goto final; + } + is_from_axisram = tensor->is_from_axisram; + +final: + return is_from_axisram; +} + +vsi_status _set_tensor_is_from_axisram + ( + vsi_nn_tensor_prv_t* tensor, + int8_t is_from_axisram + ) +{ + vsi_status status = VSI_SUCCESS; + if (NULL == tensor) { + status = VSI_FAILURE; + goto final; + } + tensor->is_from_axisram = is_from_axisram; + +final: + return status; +} + static vsi_bool _init_dummy_tensor ( vsi_nn_graph_t * graph, @@ -3314,3 +3368,106 @@ vsi_nn_tensor_t * vsi_nn_create_dummy_tensor attr->is_created_from_handle = FALSE; return _create_dummy_tensor(graph, attr); } /* vsi_nn_create_dummy_tensor() */ + +vsi_status vsi_nn_MapTensorPatch + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor, + void** ptr, + vsi_nn_accessor_type_e usage + ) +{ + vsi_status status = VSI_FAILURE; +#ifdef VSI_MAP_TENSOR_PATCH_SUPPORT + size_t dim, i; + vsi_size_t tem_stride[VSI_NN_MAX_DIM_NUM]; + vx_size start[VSI_NN_MAX_DIM_NUM], end[VSI_NN_MAX_DIM_NUM], + stride[VSI_NN_MAX_DIM_NUM]; + vx_map_id map_id = 0; + + if (NULL == graph || NULL == tensor || NULL == ptr) + { + VSILOGE("Invalid parameter"); + return status; + } + if (TRUE == tensor->attr.vtl) + { + VSILOGE("Can not access a virtual tensor."); + return status; + } + vsi_nn_GetStrideSize(&tensor->attr, tem_stride); + + memset(start, 0, sizeof(vx_size) * VSI_NN_MAX_DIM_NUM); + dim = (size_t)tensor->attr.dim_num; + for (i = 0; i < dim; i++) + { + end[i] = (size_t)tensor->attr.size[i]; + stride[i] = (size_t)tem_stride[i]; + } + + status = vxMapTensorPatch(tensor->t,dim,start,end, + &map_id,stride,ptr,usage, VX_MEMORY_TYPE_HOST); + ((vsi_nn_tensor_prv_t*)tensor)->map_id = map_id; +#else + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(tensor); + VSI_UNREFERENCED(ptr); + VSI_UNREFERENCED(usage); + VSILOGE("Function unspported, please upgrade OpenVX driver to 1.3.0!"); +#endif + return status; +} /* vsi_nn_MapTensorPatch() */ + +vsi_status vsi_nn_UnmapTensorPatch + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor + ) +{ + vsi_status status = VSI_FAILURE; +#ifdef VSI_MAP_TENSOR_PATCH_SUPPORT + vx_map_id map_id = 0; + + if (NULL == graph || NULL == tensor) + { + VSILOGE("Invalid parameter"); 
+ return status; + } + if (TRUE == tensor->attr.vtl) + { + VSILOGE("Can not access a virtual tensor."); + return status; + } + + map_id = ((vsi_nn_tensor_prv_t*)tensor)->map_id; + status = vxUnmapTensorPatch(tensor->t, map_id); +#else + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(tensor); + VSILOGE("Function unspported, please upgrade OpenVX driver to 1.3.0!"); +#endif + return status; +} /* vsi_nn_UnmapTensorPatch() */ + +vsi_nn_tensor_t * vsi_nn_CreateTensorFromAXISRAM + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_attr_t * attr + ) +{ + + if (NULL == graph || NULL == attr) { + VSILOGE("Invalid parameter"); + return NULL; + } + if (TRUE == attr->vtl) { + VSILOGE("Can not create tensor from AXI-SRAM for a virtual tensor."); + return NULL; + } +#ifdef VSI_CREATE_TENSOR_FROM_AXISRAM_SUPPORT + attr->is_created_from_handle = FALSE; + return _create_tensor(graph, NULL, attr, TRUE); +#else + return NULL; +#endif +} /*vsi_nn_CreateTensorFromAXISRAM*/ diff --git a/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h b/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h index 1937569..c041c65 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h +++ b/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h @@ -75,6 +75,17 @@ vsi_status _set_tensor_is_scalar int8_t is_salar ); +int8_t _get_tensor_is_from_axisram + ( + vsi_nn_tensor_prv_t* tensor + ); + +vsi_status _set_tensor_is_from_axisram + ( + vsi_nn_tensor_prv_t* tensor, + int8_t is_from_axisram + ); + /** * Create a new dummy tensor * Create a new dummy tensor with given attributes. @@ -107,6 +118,15 @@ vsi_bool vsi_nn_is_same_quant_type( vsi_nn_tensor_t * dst ); +vsi_nn_tensor_t * vsi_nn_kernel_insert_reshape_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * in_tensor, + vsi_size_t * shape, + uint32_t dim_num, + vsi_nn_opt_direction_e direction + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/src/vsi_nn_types_prv.h b/src/tim/vx/internal/src/vsi_nn_types_prv.h index 81b1d36..00b55fd 100644 --- a/src/tim/vx/internal/src/vsi_nn_types_prv.h +++ b/src/tim/vx/internal/src/vsi_nn_types_prv.h @@ -73,6 +73,10 @@ typedef struct _vsi_nn_node_prv int8_t processed; // Add node internal attribute here... +#if VX_GRAPH_BATCH_OPT_SUPPORT + /*split the node to "split_num" on batch dim.*/ + vsi_size_t split_num; +#endif } vsi_nn_node_prv_t; /** @@ -95,6 +99,14 @@ typedef struct _vsi_nn_tensor_prv * be done more than once */ int8_t processed; + /** For mapping tensor patch. + * map_id The address of a vx_map_id variable where the function returns a map identifier. + */ + vx_map_id map_id; + + /** create tensor from axisram.*/ + int8_t is_from_axisram; + // Add tensor internal attribute here... } vsi_nn_tensor_prv_t;
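
For reference, a minimal caller-side sketch of two helpers introduced by this patch: `vsi_nn_getenv_asint()` (added in vsi_nn_util.c) and `vsi_nn_SetNodeBatchSplitNum()` (added in vsi_nn_node.c, compiled only when VX_GRAPH_BATCH_OPT_SUPPORT is enabled). The wrapper function, the environment-variable name "VSI_NN_EXAMPLE_SPLIT", and the include set are illustrative assumptions, not part of ovxlib or of this patch.

#include "vsi_nn_pub.h"          /* assumed to pull in vsi_nn_node.h */
#include "utils/vsi_nn_util.h"   /* declares vsi_nn_getenv_asint() */

/* Illustrative helper: read a batch-split hint from a made-up environment
 * variable and store it on the node. vsi_nn_SetupGraph() later verifies the
 * hint in batchInference_graph() and applies it through vxSetNodeBatch(). */
static void hint_batch_split(vsi_nn_node_t *node)
{
    /* Falls back to 1 ("do not split") when the variable is unset; on Android
     * builds the helper queries the system property store instead of getenv(). */
    int32_t split = vsi_nn_getenv_asint("VSI_NN_EXAMPLE_SPLIT", 1);

#if VX_GRAPH_BATCH_OPT_SUPPORT
    if (node != NULL && split >= 1)
    {
        (void)vsi_nn_SetNodeBatchSplitNum(node, (int8_t)split);
    }
#else
    (void)node;
    (void)split;
#endif
}

A hint of 1 matches the check in batchInference_graph(), which skips batch splitting entirely when split_num == 1; larger values are treated as a user-defined batch count and validated before being applied.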