From 161bb8a7c4ab6a84b28cecce08a71804ca60786a Mon Sep 17 00:00:00 2001
From: Zhouheng Zheng
Date: Tue, 1 Mar 2022 17:56:03 +0800
Subject: [PATCH] Pre-release for 22Q1 (#302)

update internal to commit-id: d45da6fa

Co-authored-by: zhouheng.zheng
---
 .../include/custom/custom_node_type.def | 3 +
 .../vx/internal/include/custom/custom_ops.def | 3 +
 ...si_nn_op_custom_ainr_denoise_postprocess.h | 47 +
 .../custom/ops/vsi_nn_op_custom_warp_affine.h | 49 +
 .../ops/vsi_nn_op_custom_warp_perspective.h | 50 +
 .../include/custom/vsi_nn_custom_node_type.h | 3 +
 src/tim/vx/internal/include/interface/ops.def | 3 +
 .../include/internal/internal_ops.def | 1 +
 .../internal/include/kernel/vsi_nn_kernel.h | 14 +
 .../kernel/vsi_nn_kernel_gpu_shape_optimize.h | 6 +
 .../internal/include/ops/vsi_nn_op_deconv3d.h | 54 +
 .../internal/include/ops/vsi_nn_op_gather.h | 1 +
 .../vx/internal/include/ops/vsi_nn_op_pad2.h | 50 +
 .../internal/include/ops/vsi_nn_op_reduce.h | 2 +-
 .../ops/vsi_nn_op_reduce_mean_internal.h | 49 +
 .../vx/internal/include/utils/vsi_nn_util.h | 26 +
 .../vx/internal/include/vsi_nn_client_op.h | 11 +
 src/tim/vx/internal/include/vsi_nn_context.h | 1 +
 .../internal/include/vsi_nn_feature_config.h | 23 -
 src/tim/vx/internal/include/vsi_nn_graph.h | 23 +
 src/tim/vx/internal/include/vsi_nn_log.h | 9 +-
 .../vx/internal/include/vsi_nn_node_type.h | 8 +-
 src/tim/vx/internal/include/vsi_nn_ops.h | 9 +-
 src/tim/vx/internal/include/vsi_nn_version.h | 2 +-
 .../ops/kernel/cpu/custom_softmax_cpu.c | 2 +-
 .../ops/kernel/cpu/custom_warp_affine_cpu.c | 296 ++
 .../kernel/cpu/custom_warp_perspective_cpu.c | 300 ++
 .../ops/kernel/evis/custom_warp_affine_evis.c | 295 ++
 .../evis/custom_warp_perspective_evis.c | 300 ++
 .../ops/op_custom_ainr_denoise_postprocess.c | 136 +
 .../custom/ops/vsi_nn_op_custom_warp_affine.c | 136 +
 .../ops/vsi_nn_op_custom_warp_perspective.c | 136 +
 src/tim/vx/internal/src/kernel/cl/clip_cl.c | 18 +-
 .../src/kernel/cl/depth2space_internal_cl.c | 226 +
 .../internal/src/kernel/cl/eltwise_unary_cl.c | 10 +
 .../vx/internal/src/kernel/cl/floordiv_cl.c | 79 +-
 src/tim/vx/internal/src/kernel/cl/gather_cl.c | 114 +-
 .../src/kernel/cl/group_normalization_cl.c | 44 +-
 .../src/kernel/cl/instance_normalization_cl.c | 6 +-
 .../vx/internal/src/kernel/cl/moments_cl.c | 5 +
 src/tim/vx/internal/src/kernel/cl/topk_cl.c | 301 ++
 .../src/kernel/cpu/eltwise_unary_cpu.c | 10 +
 .../vx/internal/src/kernel/cpu/gather_cpu.c | 76 +-
 .../internal/src/kernel/cpu/gather_nd_cpu.c | 5 +-
 .../kernel/cpu/instance_normalization_cpu.c | 100 +-
 .../src/kernel/cpu/resize_bilinear_cpu.c | 2 -
 .../vx/internal/src/kernel/evis/argmax_evis.c | 77 +-
 .../src/kernel/evis/comparisons_evis.c | 30 +-
 .../kernel/evis/depth2space_internal_evis.c | 11 +
 .../src/kernel/evis/eltwise_unary_evis.c | 27 +
 .../vx/internal/src/kernel/evis/gather_evis.c | 147 +-
 .../kernel/evis/group_normalization_evis.c | 43 +-
 .../kernel/evis/instance_normalization_evis.c | 7 +-
 .../src/kernel/evis/logical_ops_evis.c | 28 +-
 .../internal/src/kernel/evis/matrixmul_evis.c | 50 +
 .../internal/src/kernel/evis/maximum_evis.c | 47 +-
 .../internal/src/kernel/evis/minimum_evis.c | 47 +-
 .../internal/src/kernel/evis/moments_evis.c | 94 +
 .../internal/src/kernel/evis/one_hot_evis.c | 87 +-
 .../vx/internal/src/kernel/evis/prelu_evis.c | 4 +-
 .../src/kernel/evis/resize_bilinear_evis.c | 120 +-
 .../src/kernel/evis/scatter_nd_evis.c | 38 +
 .../vx/internal/src/kernel/vsi_nn_kernel.c | 321 +-
 .../kernel/vsi_nn_kernel_gpu_shape_optimize.c | 44 +-
 .../vx/internal/src/kernel/vx/batch_norm_vx.c | 84 +
 .../vx/internal/src/kernel/vx/convolutional.c | 92 +-
 src/tim/vx/internal/src/kernel/vx/pad2_vx.c | 113 +
 .../internal/src/libnnext/ops/cl/clip_BF16.cl | 37 +
 .../src/libnnext/ops/cl/depth2space_crd.cl | 17 +
 .../src/libnnext/ops/cl/eltwise_unary.cl | 9 +
 .../internal/src/libnnext/ops/cl/floordiv.cl | 186 +-
 .../vx/internal/src/libnnext/ops/cl/gather.cl | 12 +-
 .../src/libnnext/ops/cl/gather_batch.cl | 123 +
 .../src/libnnext/ops/cl/moments_axis0.cl | 42 +
 .../src/libnnext/ops/cl/moments_axis01.cl | 60 +
 .../src/libnnext/ops/cl/moments_axis012.cl | 61 +
 .../src/libnnext/ops/cl/moments_axis1.cl | 41 +
 .../src/libnnext/ops/cl/moments_axis2.cl | 44 +-
 .../vx/internal/src/libnnext/ops/cl/topk.cl | 251 ++
 .../src/libnnext/ops/vx/argmax_axis2.vx | 101 +-
 .../ops/vx}/custom_softmax.vx | 11 +-
 .../src/libnnext/ops/vx/custom_warp_affine.vx | 353 ++
 .../ops/vx/custom_warp_perspective.vx | 395 ++
 .../src/libnnext/ops/vx/depth2space_crd.vx | 2 +-
 .../src/libnnext/ops/vx/eltwise_unary_2d.vx | 18 +
 .../src/libnnext/ops/vx/eltwise_unary_3d.vx | 18 +
 .../vx/internal/src/libnnext/ops/vx/gather.vx | 2 -
 .../src/libnnext/ops/vx/gather_batch.vx | 237 ++
 .../src/libnnext/ops/vx/gather_mix_batch.vx | 236 ++
 .../src/libnnext/ops/vx/logical_ops.vx | 53 +-
 .../src/libnnext/ops/vx/matrixmul_bf16.vx | 272 ++
 .../src/libnnext/ops/vx/matrixmul_f16.vx | 23 +-
 .../internal/src/libnnext/ops/vx/maximum.vx | 56 +
 .../src/libnnext/ops/vx/maximum_i16.vx | 60 +
 .../internal/src/libnnext/ops/vx/minimum.vx | 56 +
 .../src/libnnext/ops/vx/minimum_i16.vx | 60 +
 .../src/libnnext/ops/vx/moments_axis0.vx | 85 +
 .../src/libnnext/ops/vx/moments_axis012.vx | 78 +
 .../src/libnnext/ops/vx/moments_axis1.vx | 84 +
 .../src/libnnext/ops/vx/moments_axis2.vx | 46 +
 .../src/libnnext/ops/vx/moments_u8_axis012.vx | 142 +-
 .../internal/src/libnnext/ops/vx/one_hot.vx | 88 +
 .../src/libnnext/ops/vx/relational_ops_2d.vx | 47 +-
 .../src/libnnext/ops/vx/relational_ops_3d.vx | 41 +
 ...esize_bilinear_U8_half_pixel_centers_1.vx} | 0
 ...resize_bilinear_U8_half_pixel_centers_2.vx | 129 +
 .../src/libnnext/ops/vx/scatter_nd.vx | 50 +-
 .../src/libnnext/vsi_nn_libnnext_resource.c | 3740 ++++++++++++++++-
 .../internal/src/libnnext/vsi_nn_vxkernel.c | 18 +-
 src/tim/vx/internal/src/ops/vsi_nn_op_abs.c | 1 +
 .../internal/src/ops/vsi_nn_op_batch_norm.c | 44 +-
 .../src/ops/vsi_nn_op_conv2d_lstm_cell.c | 2 +-
 .../internal/src/ops/vsi_nn_op_dataconvert.c | 6 +-
 .../vx/internal/src/ops/vsi_nn_op_deconv3d.c | 302 ++
 .../src/ops/vsi_nn_op_depth2space_internal.c | 2 +
 .../vx/internal/src/ops/vsi_nn_op_eltwise.c | 100 +-
 .../src/ops/vsi_nn_op_eltwise_unary.c | 7 +-
 .../vx/internal/src/ops/vsi_nn_op_floordiv.c | 6 +
 .../src/ops/vsi_nn_op_fullconnect_relu.c | 15 +-
 .../vx/internal/src/ops/vsi_nn_op_gather.c | 20 +-
 .../src/ops/vsi_nn_op_grouped_conv2d.c | 6 +
 .../src/ops/vsi_nn_op_instancenormalize.c | 211 +-
 .../src/ops/vsi_nn_op_l2normalizescale.c | 5 +-
 .../vx/internal/src/ops/vsi_nn_op_linear.c | 4 +-
 .../internal/src/ops/vsi_nn_op_logical_ops.c | 1 +
 .../src/ops/vsi_nn_op_lstmunit_ovxlib.c | 2 +-
 .../vx/internal/src/ops/vsi_nn_op_matrixmul.c | 1 +
 .../vx/internal/src/ops/vsi_nn_op_moments.c | 2 +
 .../vx/internal/src/ops/vsi_nn_op_one_hot.c | 1 +
 src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c | 198 +
 .../vx/internal/src/ops/vsi_nn_op_reduce.c | 185 +-
 .../src/ops/vsi_nn_op_reduce_mean_internal.c | 163 +
 .../internal/src/ops/vsi_nn_op_scatter_nd.c | 1 +
 .../src/ops/vsi_nn_op_softmax_internal.c | 6 +-
 .../vx/internal/src/ops/vsi_nn_op_square.c | 1 +
 src/tim/vx/internal/src/ops/vsi_nn_op_stack.c | 10 +
 .../src/ops/vsi_nn_op_strided_slice.c | 1 +
 .../src/utils/vsi_nn_code_generator.c | 6 +-
 src/tim/vx/internal/src/utils/vsi_nn_dtype.c | 12 +
 src/tim/vx/internal/src/utils/vsi_nn_util.c | 107 +-
 src/tim/vx/internal/src/vsi_nn_client_op.c | 39 +
 src/tim/vx/internal/src/vsi_nn_context.c | 9 +-
 src/tim/vx/internal/src/vsi_nn_graph.c | 71 +-
 .../internal/src/vsi_nn_graph_optimization.c | 4 +-
 src/tim/vx/internal/src/vsi_nn_log.c | 3 +-
 src/tim/vx/internal/src/vsi_nn_node.c | 4 +-
 src/tim/vx/internal/src/vsi_nn_ops.c | 31 +-
 .../vx/internal/src/vsi_nn_pre_post_process.c | 37 +-
 src/tim/vx/internal/src/vsi_nn_tensor.c | 46 +-
 149 files changed, 12641 insertions(+), 970 deletions(-)
 create mode 100644 src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_ainr_denoise_postprocess.h
 create mode 100644 src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h
 create mode 100644 src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_perspective.h
 create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_deconv3d.h
 create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_pad2.h
 create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_reduce_mean_internal.h
 create mode 100644 src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c
 create mode 100644 src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c
 create mode 100644 src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c
 create mode 100644 src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c
 create mode 100644 src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c
 create mode 100644 src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c
 create mode 100644 src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c
 create mode 100644 src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c
 create mode 100644 src/tim/vx/internal/src/kernel/cl/topk_cl.c
 create mode 100644 src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c
 create mode 100644 src/tim/vx/internal/src/kernel/vx/pad2_vx.c
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/clip_BF16.cl
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/depth2space_crd.cl
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/topk.cl
 rename src/tim/vx/internal/src/{custom/ops/kernel/evis => libnnext/ops/vx}/custom_softmax.vx (83%)
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_perspective.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_bf16.vx
 rename src/tim/vx/internal/src/libnnext/ops/vx/{resize_bilinear_U8_half_pixel_centers.vx => resize_bilinear_U8_half_pixel_centers_1.vx} (100%)
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_2.vx
 create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c
 create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c
 create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c

diff --git a/src/tim/vx/internal/include/custom/custom_node_type.def
b/src/tim/vx/internal/include/custom/custom_node_type.def index 034c37f..0283c71 100644 --- a/src/tim/vx/internal/include/custom/custom_node_type.def +++ b/src/tim/vx/internal/include/custom/custom_node_type.def @@ -2,3 +2,6 @@ custom op data struct def */ DEF_NODE_TYPE(custom_softmax) +DEF_NODE_TYPE(custom_ainr_denoise_postprocess) +DEF_NODE_TYPE(custom_warp_affine) +DEF_NODE_TYPE(custom_warp_perspective) diff --git a/src/tim/vx/internal/include/custom/custom_ops.def b/src/tim/vx/internal/include/custom/custom_ops.def index 8ef4d50..690b057 100644 --- a/src/tim/vx/internal/include/custom/custom_ops.def +++ b/src/tim/vx/internal/include/custom/custom_ops.def @@ -2,3 +2,6 @@ Add custom ops to the end. */ DEF_OP(CUSTOM_SOFTMAX) +DEF_OP(CUSTOM_AINR_DENOISE_POSTPROCESS) +DEF_OP(CUSTOM_WARP_AFFINE) +DEF_OP(CUSTOM_WARP_PERSPECTIVE) diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_ainr_denoise_postprocess.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_ainr_denoise_postprocess.h new file mode 100644 index 0000000..1a7e623 --- /dev/null +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_ainr_denoise_postprocess.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CUSTOM_AINR_DENOISE_POSTPROCESS_H +#define _VSI_NN_OP_CUSTOM_AINR_DENOISE_POSTPROCESS_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_custom_ainr_denoise_postprocess_param +{ + struct _ainr_denoise_postprocess_local_data_t* local; + // Add parameters here +} vsi_nn_custom_ainr_denoise_postprocess_param; +_compiler_assert(offsetof(vsi_nn_custom_ainr_denoise_postprocess_param, local) == 0, \ + vsi_nn_custom_ainr_denoise_postprocess_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h new file mode 100644 index 0000000..815a064 --- /dev/null +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h @@ -0,0 +1,49 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CUSTOM_WARP_AFFINE_H +#define _VSI_NN_OP_CUSTOM_WARP_AFFINE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_custom_warp_affine_param +{ + struct _custom_warp_affine_local_data_t* local; + // Add parameters here + const float *matrix; + vsi_enum type; + int32_t size[2]; +} vsi_nn_custom_warp_affine_param; +_compiler_assert(offsetof(vsi_nn_custom_warp_affine_param, local) == 0, \ + vsi_nn_custom_warp_affine_h ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_perspective.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_perspective.h new file mode 100644 index 0000000..8aceb2a --- /dev/null +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_perspective.h @@ -0,0 +1,50 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CUSTOM_WARP_PERSPECTIVE_H +#define _VSI_NN_OP_CUSTOM_WARP_PERSPECTIVE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_custom_warp_perspective_param +{ + struct _custom_warp_perspective_local_data_t* local; + // Add parameters here + const float *matrix; + vsi_enum type; + int32_t size[2]; +} vsi_nn_custom_warp_perspective_param; +_compiler_assert(offsetof(vsi_nn_custom_warp_perspective_param, local) == 0, \ + vsi_nn_custom_warp_perspective_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h index 16d3d0c..1a05c8a 100644 --- a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h +++ b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h @@ -27,5 +27,8 @@ custom op head files */ #include "custom/ops/vsi_nn_op_custom_softmax.h" +#include "custom/ops/vsi_nn_op_custom_ainr_denoise_postprocess.h" +#include "custom/ops/vsi_nn_op_custom_warp_affine.h" +#include "custom/ops/vsi_nn_op_custom_warp_perspective.h" #endif diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index cf5bebb..4765bd5 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -165,3 +165,6 @@ DEF_OP(GRUCELL) DEF_OP(GRUCELL_ACTIVATION) DEF_OP(RESHAPE2) DEF_OP(CONV3D) +DEF_OP(DECONV3D) +DEF_OP(PAD2) +DEF_OP(COS) diff --git a/src/tim/vx/internal/include/internal/internal_ops.def b/src/tim/vx/internal/include/internal/internal_ops.def index 06dbc61..a47559a 100644 --- a/src/tim/vx/internal/include/internal/internal_ops.def +++ b/src/tim/vx/internal/include/internal/internal_ops.def @@ -19,3 +19,4 @@ DEF_OP(RESIZE_1D_NEAREST_INTERNAL) DEF_OP(SPACE2DEPTH_INTERNAL) DEF_OP(GRUCELL_H_TIMES_ACTIVATION_R) DEF_OP(GRUCELL_ACTIVATION_Z_H) +DEF_OP(REDUCE_MEAN_INTERNAL) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h index 05222b2..f8163be 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h @@ -640,6 +640,13 @@ vsi_nn_kernel_node_t vsi_nn_kernel_create_node vsi_nn_kernel_t * kernel ); +vsi_nn_kernel_node_t vsi_nn_kernel_create_node_ext + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_t * kernel, + const char** resources + ); + vsi_status vsi_nn_kernel_node_set_border (vsi_nn_kernel_node_t node, vx_border_t* border); @@ -720,6 +727,13 @@ vsi_status vsi_nn_kernel_register vsi_nn_kernel_t * kernel ); +vsi_status vsi_nn_kernel_register_ext + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_t * kernel, + const char** resources + ); + vsi_bool vsi_nn_kernel_gpu_check_shape ( const vsi_size_t * shape, vsi_size_t rank ); diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h index 1f4c947..26a676f 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h @@ -79,4 +79,10 @@ vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape vsi_size_t* out_shape, uint32_t* out_rank ); +vsi_bool vsi_nn_kernel_optimize_group_norm_shape + ( + const vsi_size_t* shape, const uint32_t rank, int32_t groups, + int32_t is_sp_kernel, vsi_size_t* 
out_shape + ); + #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_deconv3d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_deconv3d.h new file mode 100644 index 0000000..133267f --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_deconv3d.h @@ -0,0 +1,54 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_DECONV3D_H +#define _VSI_NN_OP_DECONV3D_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_deconv3d_param +{ + struct _deconv3d_local_data_t* local; + // Add parameters here + uint32_t ksize[3]; + uint32_t stride[3]; + /* Pad left, right, top, bottom, front, rear */ + uint32_t pad[6]; + + uint32_t weights; + uint32_t group; + uint32_t output_padding[3]; +} vsi_nn_deconv3d_param; +_compiler_assert(offsetof(vsi_nn_deconv3d_param, local) == 0, \ + vsi_nn_deconv3d_h ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_gather.h b/src/tim/vx/internal/include/ops/vsi_nn_op_gather.h index 0d76800..c9792c9 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_gather.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_gather.h @@ -41,6 +41,7 @@ typedef struct _vsi_nn_gather_param { vsi_nn_gather_lcl_data local; int32_t axis; + int32_t batch_dims; } vsi_nn_gather_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pad2.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pad2.h new file mode 100644 index 0000000..f2672a2 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pad2.h @@ -0,0 +1,50 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_PAD2_H +#define _VSI_NN_OP_PAD2_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_pad2_param +{ + struct _pad2_local_data_t* local; + const uint32_t * front_size; + const uint32_t * back_size; + uint8_t dim_num; + float const_val; + vsi_nn_pad_mode_e mode; +} vsi_nn_pad2_param; +_compiler_assert(offsetof(vsi_nn_pad2_param, local) == 0, \ + vsi_nn_pad2_h ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reduce.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reduce.h index cf7bb8b..57997d2 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reduce.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reduce.h @@ -51,7 +51,7 @@ typedef struct _vsi_nn_reduce_param { /* local data must be the first. */ vsi_nn_reduce_lcl_data_t local; - vx_enum type; + vsi_enum type; const int32_t *axis; vx_uint32 axis_num; vx_bool keep_dim; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reduce_mean_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reduce_mean_internal.h new file mode 100644 index 0000000..20eb56c --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reduce_mean_internal.h @@ -0,0 +1,49 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_REDUCE_MEAN_INTERNAL_H +#define _VSI_NN_OP_REDUCE_MEAN_INTERNAL_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_reduce_mean_internal_param +{ + struct _reduce_mean_internal_local_data_t* local; + // Add parameters here + vx_int32 *axis; + vx_uint32 axis_num; + float scale; +} vsi_nn_reduce_mean_internal_param; +_compiler_assert(offsetof(vsi_nn_reduce_mean_internal_param, local) == 0, \ + vsi_nn_reduce_mean_internal_h ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h index 7aa984e..8687247 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -28,6 +28,7 @@ /*------------------------------------------- Includes -------------------------------------------*/ +#include #include "vsi_nn_platform.h" #include "vsi_nn_tensor.h" #include "vsi_nn_types.h" @@ -398,6 +399,31 @@ void vsi_nn_get_tensor_clamp_min_max float *clampMax ); +char* vsi_nn_strncpy + ( + char* dest, + const char* source, + size_t count + ); + +char* vsi_nn_strncat + ( + char* dest, + const char* source, + size_t count + ); + +char* vsi_nn_getenv + ( + const char * var_name + ); + +FILE* vsi_nn_fopen + ( + const char * file_name, + const char * mode + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_client_op.h b/src/tim/vx/internal/include/vsi_nn_client_op.h index 856f81b..c166ce7 100644 --- a/src/tim/vx/internal/include/vsi_nn_client_op.h +++ b/src/tim/vx/internal/include/vsi_nn_client_op.h @@ -71,6 +71,17 @@ OVXLIB_API void vsi_nn_OpRemoveClient vsi_nn_op_t op ); +vsi_bool vsi_nn_OpAddClientName + ( + vsi_nn_op_t op, + const char* kernel_name + ); + +const char* vsi_nn_OpGetClientName + ( + vsi_nn_op_t op + ); + #if defined(__cplusplus) } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index 4374441..20a4dd1 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -73,6 +73,7 @@ typedef struct _vsi_nn_runtime_option_t int32_t enable_shader; int32_t enable_opcheck; int32_t enable_concat_optimize; + int32_t enable_asymi8_to_u8; } vsi_nn_runtime_option_t; /** diff --git a/src/tim/vx/internal/include/vsi_nn_feature_config.h b/src/tim/vx/internal/include/vsi_nn_feature_config.h index db38ecc..8906a96 100644 --- a/src/tim/vx/internal/include/vsi_nn_feature_config.h +++ b/src/tim/vx/internal/include/vsi_nn_feature_config.h @@ -1,26 +1,3 @@ -/**************************************************************************** -* -* Copyright (c) 2019 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the Software), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ /*****Auto generated header file, Please DO NOT modify manually!*****/ #ifndef _VSI_NN_FEATURE_CONFIG_H #define _VSI_NN_FEATURE_CONFIG_H diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h index ffb5dd0..dda35b7 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph.h +++ b/src/tim/vx/internal/include/vsi_nn_graph.h @@ -456,6 +456,29 @@ OVXLIB_API vsi_nn_node_t * vsi_nn_AddNode vsi_nn_node_id_t * node_id ); +/** + * Add External node + * Create a new External node and attach it to graph. + * + * @param[in] graph Graph handle + * @param[in] op Node operation. + * @param[in] vsi_nn_proc_t to this node. + * @param[in] output_num Number of outputs to this node. + * @param[in] kernel name. + * @param[out] node_id A handle to get the id of new node, + * pass it to NULL to get nothing. + * + * @return The node handle on success, or NULL otherwise. + */ +OVXLIB_API vsi_nn_node_t * vsi_nn_AddExternalNode + ( + vsi_nn_graph_t * graph, + vsi_nn_op_t op, + const void * proc, + vsi_nn_node_id_t * node_id, + const char *kernel_name + ); + /** * @deprecated * @see vsi_nn_AddNode diff --git a/src/tim/vx/internal/include/vsi_nn_log.h b/src/tim/vx/internal/include/vsi_nn_log.h index fd7d37a..d3afaa2 100644 --- a/src/tim/vx/internal/include/vsi_nn_log.h +++ b/src/tim/vx/internal/include/vsi_nn_log.h @@ -24,14 +24,18 @@ #ifndef _VSI_NN_LOG_H #define _VSI_NN_LOG_H -#include + +#include "utils/vsi_nn_util.h" #if defined(__cplusplus) extern "C"{ #endif #ifdef _MSC_VER -#define snprintf _snprintf +#define snprintf(buffer, count, format, ...) 
\ + _snprintf_s(buffer, count, _TRUNCATE, format, ##__VA_ARGS__) +#define vsnprintf(buffer, count, format, args) \ + _vsnprintf_s(buffer, count, _TRUNCATE, format, args) #endif typedef enum _vsi_nn_log_level_e @@ -68,4 +72,3 @@ OVXLIB_API void vsi_nn_LogMsg #endif #endif - diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index 0278c4b..395ee2e 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -182,6 +182,9 @@ #include "ops/vsi_nn_op_conv3d.h" #include "ops/vsi_nn_op_grucell_h_times_activation_r.h" #include "ops/vsi_nn_op_grucell_activation_z_h.h" +#include "ops/vsi_nn_op_deconv3d.h" +#include "ops/vsi_nn_op_reduce_mean_internal.h" +#include "ops/vsi_nn_op_pad2.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" @@ -350,7 +353,10 @@ typedef union _vsi_nn_nn_param vsi_nn_conv3d_param conv3d; vsi_nn_grucell_h_times_activation_r_param grucell_h_times_activation_r; vsi_nn_grucell_activation_z_h_param grucell_activation_z_h; - uint8_t client_param[128]; + vsi_nn_deconv3d_param deconv3d; + vsi_nn_reduce_mean_internal_param reduce_mean_internal; + vsi_nn_pad2_param pad2; + void* client_param; /* custom node data struct define */ #define DEF_NODE_TYPE( NAME ) vsi_nn_##NAME##_param NAME; diff --git a/src/tim/vx/internal/include/vsi_nn_ops.h b/src/tim/vx/internal/include/vsi_nn_ops.h index 4c79499..40671e7 100644 --- a/src/tim/vx/internal/include/vsi_nn_ops.h +++ b/src/tim/vx/internal/include/vsi_nn_ops.h @@ -48,7 +48,7 @@ extern "C"{ * @see include/custom/custom_ops.def * @see include/internal/internal_ops.def */ -typedef uint32_t vsi_nn_op_t; enum +typedef int32_t vsi_nn_op_t; enum { #define DEF_OP( NAME, ... ) VSI_NN_OP_##NAME, #include "interface/ops.def" @@ -317,6 +317,13 @@ vsi_bool vsi_nn_OpRegisterOvxInit vsi_nn_op_compute_t compute ); +vsi_bool vsi_nn_OpRegisterExternalOvxInit + ( + vsi_nn_op_t op, + const char* kernel_name, + vsi_nn_op_proc_t* proc + ); + /** * Get operation name * Get operation name string by operation id. 
diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index b0acac3..328aa19 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 37 +#define VSI_NN_VERSION_PATCH 39 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c index 1eeb997..ed1e149 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c @@ -77,7 +77,7 @@ DEF_KERNEL_EXECUTOR(_softmax_exec) /* alloc the float32 data buffer */ buffer[1] = (float *)malloc(out_elements * sizeof(float)); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); memset(buffer[1], 0, out_elements * sizeof(float)); buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c new file mode 100644 index 0000000..f2cb031 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c @@ -0,0 +1,296 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.custom_warp_affine") + + +/* + * Kernel params + */ +static vx_param_description_t _custom_warp_affine_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_kernel_param_def ) +#define SCALAR_INPUT_TYPE (2) +#define SCALAR_MATRIX_OFFSET (3) + +static void _transform_affine + ( + vsi_size_t dst_x, + vsi_size_t dst_y, + const float m[], + float *src_x, + float *src_y + ) +{ + *src_x = dst_x * m[0] + dst_y * m[2] + m[4]; + *src_y = dst_x * m[1] + dst_y * m[3] + m[5]; +} + +static vsi_bool _read_pixel + ( + float *base, + vsi_nn_kernel_tensor_attr_t *attr, + float x, + float y, + float *pixel + ) +{ + vsi_size_t width = attr->shape->data[0]; + vsi_size_t height = attr->shape->data[1]; + vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= width || y >= height); + vsi_size_t bx = 0, by = 0; + + if (out_of_bounds) + { + *pixel = 205.0f; + return TRUE; + } + + // bounded x/y + bx = x < 0 ? 0 : x >= width ? width - 1 : (vsi_size_t)x; + by = y < 0 ? 0 : y >= height ? height - 1 : (vsi_size_t)y; + + *pixel = base[by * width + bx]; + + return TRUE; +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + float* buffer[_CPU_IO_NUM] = { NULL }; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL }; + int32_t type = 0; + float matrix[6] = {0}; + vsi_size_t i = 0; + vsi_size_t b = 0; + vsi_size_t x = 0; + vsi_size_t y = 0; + vsi_size_t out_elements = 0; + vsi_size_t width = 0; + vsi_size_t height = 0; + vsi_size_t outer_size = 1; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + /* alloc the float32 data buffer */ + buffer[1] = (float *)malloc(out_elements * sizeof(float)); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); + memset(buffer[1], 0, out_elements * sizeof(float)); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_TYPE], + &type); + CHECK_STATUS_FAIL_GOTO(status, final ); + for (i = 0; i < 6; i++) + { + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i], + &matrix[i]); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + width = attr[1]->shape->data[0]; + height = attr[1]->shape->data[1]; + for(i = 2; i < (vsi_size_t)attr[1]->shape->size; ++i) + 
{ + outer_size *= attr[1]->shape->data[i]; + } + // Do something + for (b = 0; b < outer_size; b++) + { + float *src_base = buffer[0] + b * attr[0]->shape->data[0] * attr[0]->shape->data[1]; + float *dst_base = buffer[1] + b * width * height; + for (y = 0; y < height; y++) + { + for (x = 0; x < width; x++) + { + float xf = 0; + float yf = 0; + float dst = 0; + + _transform_affine(x, y, matrix, &xf, &yf); + if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR) + { + _read_pixel(src_base, attr[0], xf, yf, &dst); + dst_base[y * width + x] = dst; + } + else + { + float tl = 0, tr = 0, bl = 0, br = 0; + float ar = xf - floorf(xf); + float ab = yf - floorf(yf); + float al = 1.0f - ar; + float at = 1.0f - ab; + + _read_pixel(src_base, attr[0], floorf(xf), floorf(yf), &tl); + _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf), &tr); + _read_pixel(src_base, attr[0], floorf(xf), floorf(yf) + 1, &bl); + _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf) + 1, &br); + + dst_base[y * width + x] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab; + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _custom_warp_affine_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _custom_warp_affine_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CUSTOM_WARP_AFFINE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + size_t i = 0; + size_t buffer_size = 0; + int32_t type = vsi_nn_kernel_param_get_int32( params, "type"); + float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size ); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_TYPE] = vsi_nn_kernel_scalar_create( + graph, I32, &type ); + for (i = 0; i < buffer_size; i++) + { + node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create( + graph, F32, &buffer[i] ); + } + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TYPE] ); + for (i = 0; i < buffer_size; i++) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] ); + } + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( custom_warp_affine, _setup ) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c new file mode 100644 index 0000000..397f022 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c @@ -0,0 +1,300 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.custom_warp_perspective") + + +/* + * Kernel params + */ +static vx_param_description_t _custom_warp_perspective_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM _cnt_of_array( _custom_warp_perspective_kernel_param_def ) +#define SCALAR_INPUT_TYPE (2) +#define SCALAR_MATRIX_OFFSET (3) + +static void _transform_perspective + ( + vsi_size_t dst_x, + vsi_size_t dst_y, + const float m[], + float *src_x, + float *src_y + ) +{ + float z = dst_x * m[2] + dst_y * m[5] + m[8]; + + *src_x = (dst_x * m[0] + dst_y * m[3] + m[6]) / z; + *src_y = (dst_x * m[1] + dst_y * m[4] + m[7]) / z; +} + +static vsi_bool _read_pixel + ( + float *base, + vsi_nn_kernel_tensor_attr_t *attr, + float x, + float y, + float *pixel + ) +{ + vsi_size_t width = attr->shape->data[0]; + vsi_size_t height = attr->shape->data[1]; + vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= width || y >= height); + vsi_size_t bx = 0, by = 0; + + if (out_of_bounds) + { + *pixel = 205.0f; + return TRUE; + } + + // bounded x/y + bx = x < 0 ? 0 : x >= width ? width - 1 : (vsi_size_t)x; + by = y < 0 ? 0 : y >= height ? 
height - 1 : (vsi_size_t)y; + + *pixel = base[by * width + bx]; + + return TRUE; +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + float* buffer[_CPU_IO_NUM] = { NULL }; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL }; + int32_t type = 0; + float matrix[9] = {0}; + vsi_size_t i = 0; + vsi_size_t b = 0; + vsi_size_t x = 0; + vsi_size_t y = 0; + vsi_size_t out_elements = 0; + vsi_size_t width = 0; + vsi_size_t height = 0; + vsi_size_t outer_size = 1; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + /* alloc the float32 data buffer */ + buffer[1] = (float *)malloc(out_elements * sizeof(float)); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); + memset(buffer[1], 0, out_elements * sizeof(float)); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_TYPE], + &type); + CHECK_STATUS_FAIL_GOTO(status, final ); + for (i = 0; i < 9; i++) + { + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i], + &matrix[i]); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + width = attr[1]->shape->data[0]; + height = attr[1]->shape->data[1]; + for(i = 2; i < (vsi_size_t)attr[1]->shape->size; ++i) + { + outer_size *= attr[1]->shape->data[i]; + } + // Do something + for (b = 0; b < outer_size; b++) + { + float *src_base = buffer[0] + b * attr[0]->shape->data[0] * attr[0]->shape->data[1]; + float *dst_base = buffer[1] + b * width * height; + for (y = 0; y < height; y++) + { + for (x = 0; x < width; x++) + { + float xf = 0; + float yf = 0; + float dst = 0; + + _transform_perspective(x, y, matrix, &xf, &yf); + if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR) + { + _read_pixel(src_base, attr[0], xf, yf, &dst); + dst_base[y * width + x] = dst; + } + else + { + float tl = 0, tr = 0, bl = 0, br = 0; + float ar = xf - floorf(xf); + float ab = yf - floorf(yf); + float al = 1.0f - ar; + float at = 1.0f - ab; + + _read_pixel(src_base, attr[0], floorf(xf), floorf(yf), &tl); + _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf), &tr); + _read_pixel(src_base, attr[0], floorf(xf), floorf(yf) + 1, &bl); + _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf) + 1, &br); + + dst_base[y * width + x] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab; + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", 
_KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _custom_warp_perspective_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _custom_warp_perspective_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CUSTOM_WARP_PERSPECTIVE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + size_t i = 0; + size_t buffer_size = 0; + int32_t type = vsi_nn_kernel_param_get_int32( params, "type"); + float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size ); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_INPUT_TYPE] = vsi_nn_kernel_scalar_create( + graph, I32, &type ); + for (i = 0; i < buffer_size; i++) + { + node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create( + graph, F32, &buffer[i] ); + } + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TYPE] ); + for (i = 0; i < buffer_size; i++) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] ); + } + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( custom_warp_perspective, _setup ) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c new file mode 100644 index 0000000..1698251 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c @@ -0,0 +1,295 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum _custom_warp_affine_type_e +{ + nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR, + bilinear = VSI_NN_INTERPOLATION_BILINEAR, +}custom_warp_affine_type_e; + +#define _CUSTOM_WARP_AFFINE_KERNEL_SOURCE "custom_warp_affine" + +// Add kernel hashtable here +#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20)) +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0 ), \ + CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE), \ + _CUSTOM_WARP_AFFINE_KERNEL_SOURCE } +#define PACK_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1 ), \ + CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D"), \ + _CUSTOM_WARP_AFFINE_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _custom_warp_affine_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, U8, nearest_neighbor ), + PACK_KERNEL_MAP( U8, U8, bilinear ), + + PACK_2D_KERNEL_MAP( U8, U8, nearest_neighbor ), + PACK_2D_KERNEL_MAP( U8, U8, bilinear ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _custom_warp_affine_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_kernel_param_def ) +#define SCALAR_MATRIX_OFFSET (2) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + vsi_size_array_t * out_shape = NULL; + float m[6] = {0}; + float matrix0[4] = {0}; + float matrix1[4] = {0}; + float matrix4[4] = {0}; + int32_t i = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + for (i = 0; i < 6; i++) + { + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i], + &m[i]); + 
CHECK_STATUS_FAIL_GOTO(status, final ); + } + + matrix0[0] = m[0]; matrix0[1] = m[1]; matrix0[2] = m[2]; matrix0[3] = m[3]; + matrix1[0] = m[4]; matrix1[1] = m[5]; + matrix4[0] = m[0]; matrix4[1] = m[1]; matrix4[2] = m[0] * 2; matrix4[3] = m[1] * 2; + + out_shape = attr[1]->shape; + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_add_param( node, + "matrix0", &matrix0 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "matrix1", &matrix1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "matrix4", &matrix4 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _custom_warp_affine_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t type + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _custom_warp_affine_kernel_map; + size_t kernel_map_size = _cnt_of_array( _custom_warp_affine_kernel_map ); + vx_param_description_t * param_def = _custom_warp_affine_kernel_param_def; + vx_kernel_initialize_f initializer = _custom_warp_affine_initializer; + int32_t is_2d_img = inputs[0]->attr.dim_num < 3 || inputs[0]->attr.size[2] == 1; + uint32_t key = 0; + uint32_t i = 0; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _custom_warp_affine_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CUSTOM_WARP_AFFINE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + size_t i = 0; + size_t buffer_size = 0; + int32_t type = vsi_nn_kernel_param_get_int32( params, "type"); + float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( 
params, "matrix", &buffer_size ); + + if (vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, type ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + for (i = 0; i < buffer_size; i++) + { + node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create( + graph, F32, &buffer[i] ); + } + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM ); + for (i = 0; i < buffer_size; i++) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] ); + } + // Set default border mode. + border.constant_value.U32 = 0xcdcdcdcd; + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( custom_warp_affine, _setup ) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c new file mode 100644 index 0000000..6936759 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c @@ -0,0 +1,300 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum _custom_warp_perspective_type_e +{ + nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR, + bilinear = VSI_NN_INTERPOLATION_BILINEAR, +}custom_warp_perspective_type_e; +#define _CUSTOM_WARP_PERSPECTIVE_KERNEL_SOURCE "custom_warp_perspective" + +// Add kernel hashtable here +#define CUSTOM_WARP_PERSPECTIVE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20)) +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ + { CUSTOM_WARP_PERSPECTIVE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0 ), \ + CVIVANTE_NAMESPACE("evis.custom_warp_perspective_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE), \ + _CUSTOM_WARP_PERSPECTIVE_KERNEL_SOURCE } +#define PACK_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ + { CUSTOM_WARP_PERSPECTIVE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1 ), \ + CVIVANTE_NAMESPACE("evis.custom_warp_perspective_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D"), \ + _CUSTOM_WARP_PERSPECTIVE_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _custom_warp_perspective_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, U8, nearest_neighbor ), + PACK_KERNEL_MAP( U8, U8, bilinear ), + + PACK_2D_KERNEL_MAP( U8, U8, nearest_neighbor ), + PACK_2D_KERNEL_MAP( U8, U8, bilinear ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _custom_warp_perspective_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM _cnt_of_array( _custom_warp_perspective_kernel_param_def ) +#define SCALAR_MATRIX_OFFSET (2) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_custom_warp_perspective_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + vsi_size_array_t * out_shape = NULL; + float m[9] = {0}; + float matrix0[4] = {0}; + float matrix1[4] = {0}; + float matrix2[4] = {0}; + float matrix4[4] = {0}; + int32_t i = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + for (i = 0; i < 9; i++) + { + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i], + &m[i]); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + matrix0[0] = m[0]; matrix0[1] = m[1]; matrix0[2] = m[3]; matrix0[3] = m[4]; + matrix1[0] = m[6]; matrix1[1] = m[7]; matrix1[2] = m[2]; matrix1[3] = m[5]; + matrix2[0] = m[8]; + matrix4[0] = m[0]; matrix4[1] = 
m[1]; matrix4[2] = m[0] * 2; matrix4[3] = m[1] * 2; + + out_shape = attr[1]->shape; + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_add_param( node, + "matrix0", &matrix0 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "matrix1", &matrix1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "matrix2", &matrix2 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "matrix4", &matrix4 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _custom_warp_perspective_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t type + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _custom_warp_perspective_kernel_map; + size_t kernel_map_size = _cnt_of_array( _custom_warp_perspective_kernel_map ); + vx_param_description_t * param_def = _custom_warp_perspective_kernel_param_def; + vx_kernel_initialize_f initializer = _custom_warp_perspective_initializer; + int32_t is_2d_img = inputs[0]->attr.dim_num < 3 || inputs[0]->attr.size[2] == 1; + uint32_t key = 0; + uint32_t i = 0; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = CUSTOM_WARP_PERSPECTIVE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _custom_warp_perspective_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CUSTOM_WARP_PERSPECTIVE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + size_t i = 0; + size_t buffer_size = 0; + int32_t type = vsi_nn_kernel_param_get_int32( params, "type"); + float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size ); + + if (vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, 
&outputs[0]->attr.dtype) == FALSE) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, type ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + for (i = 0; i < buffer_size; i++) + { + node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create( + graph, F32, &buffer[i] ); + } + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM ); + for (i = 0; i < buffer_size; i++) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] ); + } + // Set default border mode. + border.constant_value.U32 = 0xcdcdcdcd; + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( custom_warp_perspective, _setup ) diff --git a/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c b/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c new file mode 100644 index 0000000..2e7415e --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c @@ -0,0 +1,136 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
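For reference on the matrix repacking in the perspective initializer above: the nine scalars form a 3x3 homography, and the grouping into matrix0 = (m0, m1, m3, m4), matrix1 = (m6, m7, m2, m5) and matrix2 = (m8) matches a per-pixel mapping of the form below. This is a hedged sketch; the exact convention is fixed by custom_warp_perspective.vx.

/* Sketch of the per-pixel mapping the perspective shader implements. */
float z     = x * m[2] + y * m[5] + m[8];
float x_src = (x * m[0] + y * m[3] + m[6]) / z;
float y_src = (x * m[1] + y * m[4] + m[7]) / z;

Source reads that land outside the input image fall back to the constant border the _setup() installs on the node (constant_value.U32 = 0xcdcdcdcd).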
+* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _ainr_denoise_postprocess_local_data_t { + int32_t placeholder; +} ainr_denoise_postprocess_local_data_t; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + +#if defined(VX_DENOISE_POSTPROCESS_SUPPORT) && VX_DENOISE_POSTPROCESS_SUPPORT + self->n = vxDenoisePostProcesslayer( + self->graph->g, + REQUIRED_IO(inputs[0]), // currInput + REQUIRED_IO(inputs[1]), // nnOutput + REQUIRED_IO(inputs[2]), // preOutImg + REQUIRED_IO(inputs[3]), // S0 + REQUIRED_IO(inputs[4]), // C0 + REQUIRED_IO(inputs[5]), // C1 + REQUIRED_IO(inputs[6]), // C2 + REQUIRED_IO(inputs[7]), // C3 + REQUIRED_IO(inputs[8]), // clampMin + REQUIRED_IO(inputs[9]), // clampMax + REQUIRED_IO(outputs[0]) // output + ); +#else + self->n = NULL; +#endif + + if(NULL == self->n) + { + VSILOGE( "Create vxDenoisePostProcesslayer fail." ); + return VSI_FAILURE; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + return VSI_SUCCESS; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUSTOM_AINR_DENOISE_POSTPROCESS, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 10, + /* output_num */ 1 + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c new file mode 100644 index 0000000..e076b7c --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c @@ -0,0 +1,136 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _custom_warp_affine_local_data_t { + int32_t placeholder; +} custom_warp_affine_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_custom_warp_affine_param * p; + p = &(self->nn_param.custom_warp_affine); + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 6 ); + vsi_nn_kernel_param_add_int32( param, "type", p->type); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "custom_warp_affine", + inputs, 1, + outputs, 1, param ); + + vsi_nn_kernel_param_release( ¶m ); + + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + uint32_t i = 0; + + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = self->nn_param.custom_warp_affine.size[0]; + outputs[0]->attr.size[1] = self->nn_param.custom_warp_affine.size[1]; + + for (i = 2; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUSTOM_WARP_AFFINE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c new file mode 100644 index 0000000..7afbd83 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c @@ -0,0 +1,136 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following 
conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _custom_warp_perspective_local_data_t { + int32_t placeholder; +} custom_warp_perspective_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_custom_warp_affine_param * p; + p = &(self->nn_param.custom_warp_affine); + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 9 ); + vsi_nn_kernel_param_add_int32( param, "type", p->type); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "custom_warp_perspective", + inputs, 1, + outputs, 1, param ); + + vsi_nn_kernel_param_release( ¶m ); + + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + uint32_t i = 0; + + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = self->nn_param.custom_warp_perspective.size[0]; + outputs[0]->attr.size[1] = self->nn_param.custom_warp_perspective.size[1]; + + for (i = 2; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUSTOM_WARP_PERSPECTIVE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/clip_cl.c b/src/tim/vx/internal/src/kernel/cl/clip_cl.c index cc62fab..f40c56e 100644 --- a/src/tim/vx/internal/src/kernel/cl/clip_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/clip_cl.c @@ -64,14 +64,16 @@ typedef struct static const _kernel_map_type _clip_kernel_map[] = { - PACK_KERNEL_MAP(F32, F32), - PACK_KERNEL_MAP(F32, U8), - PACK_KERNEL_MAP(U8, U8), - PACK_KERNEL_MAP(U8, F32), - PACK_KERNEL_MAP_2D(F32, F32), - PACK_KERNEL_MAP_2D(F32, U8), - PACK_KERNEL_MAP_2D(U8, U8), - PACK_KERNEL_MAP_2D(U8, F32), + PACK_KERNEL_MAP(F32, F32), + PACK_KERNEL_MAP(F32, U8), + PACK_KERNEL_MAP(U8, U8), + PACK_KERNEL_MAP(U8, F32), + PACK_KERNEL_MAP(BF16, BF16), + PACK_KERNEL_MAP_2D(F32, F32), + PACK_KERNEL_MAP_2D(F32, U8), + PACK_KERNEL_MAP_2D(U8, U8), + PACK_KERNEL_MAP_2D(U8, F32), + PACK_KERNEL_MAP_2D(BF16, BF16), }; diff --git a/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c new file mode 100644 index 0000000..e1bb5f9 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c @@ -0,0 +1,226 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
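The CL and EVIS backends touched in this patch (clip, depth2space_internal, eltwise_unary, floordiv, gather, topk, the custom warps) all select their kernel the same way: the input/output dtypes plus a variant flag are packed into a 32-bit key, and a static table is scanned for the entry whose function name and source file are then bound to the kernel. A self-contained toy of the scheme; the enum values and kernel names here are illustrative, not the library's real codes:

#include <stdint.h>
#include <stdio.h>

enum { U8 = 1, F16 = 2, F32 = 3, BF16 = 4 };  /* illustrative dtype codes */

#define HASH_KEY(in, out, flag) \
    ((uint32_t)(in) | ((uint32_t)(out) << 8) | ((uint32_t)(flag) << 16))
#define _cnt_of_array(a) (sizeof(a) / sizeof((a)[0]))

typedef struct { uint32_t key; const char * function_name; } kernel_map_entry;

static const kernel_map_entry kernel_map[] = {
    { HASH_KEY(F32,  F32,  0), "cl.clip_F32toF32"   },
    { HASH_KEY(BF16, BF16, 0), "cl.clip_BF16toBF16" },  /* variant added by this patch */
};

int main(void)
{
    uint32_t key = HASH_KEY(BF16, BF16, 0);
    size_t i = 0;
    for (i = 0; i < _cnt_of_array(kernel_map); i++)
    {
        if (kernel_map[i].key == key)
        {
            printf("selected %s\n", kernel_map[i].function_name);  /* -> cl.clip_BF16toBF16 */
            break;
        }
    }
    return 0;
}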
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ + +#define _DEPTH2SPACE_CRD_KERNEL_SOURCE "depth2space_crd" + +// Add kernel hashtable here +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F32TOF32 CVIVANTE_NAMESPACE("cl.depth2space_crd_F32toF32") + +// Add kernel hashtable here +#define HASH_DEPTH2SPACE_CRD_KEY(_input0_type, _output_type, _blk_size) \ + ((_input0_type << 24) | (_output_type << 16) | (_blk_size << 8)) + +#define TENSOR_DEPTH2SPACE_CRD_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_DEPTH2SPACE_CRD_KEY(IN0_TYPE, OUT_TYPE, 0), \ + VX_KERNEL_NAME_DEPTH2SPACE_CRD_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } depth2space_crd_map[] = +{ + TENSOR_DEPTH2SPACE_CRD_KERNELS(F32, F32, _DEPTH2SPACE_CRD_KERNEL_SOURCE) +}; + +/* + * Kernel params + */ +static vx_param_description_t _depth2space_crd_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _DEPTH2SPACE_CRD_PARAM_NUM _cnt_of_array( _depth2space_crd_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t output_dims = 0; + int32_t output_width = 0; + int32_t output_height = 0; + int32_t output_chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + output_dims = (int32_t)attr[0]->shape->size; + output_width = (int32_t)(attr[0]->shape->data[0]); + output_height = (int32_t)(attr[0]->shape->data[1]); + output_chn = (int32_t)(output_dims > 2 ? 
attr[0]->shape->data[2] : 1); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = output_width; + gpu_param.global_size[1] = output_height; + gpu_param.global_size[2] = output_chn; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _depth2space_crd_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, 0 ); + + for ( i = 0; i < _cnt_of_array(depth2space_crd_map); i ++ ) + { + if ( depth2space_crd_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(depth2space_crd_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", depth2space_crd_map[i].function_name ); + kernel->info.parameters = _depth2space_crd_kernel_param_def; + kernel->info.numParams = _DEPTH2SPACE_CRD_PARAM_NUM; + kernel->info.initialize = _depth2space_crd_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + depth2space_crd_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + depth2space_crd_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_DEPTH2SPACE_CRD_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( node_params, _DEPTH2SPACE_CRD_PARAM_NUM, + inputs, 1, outputs, 1 ); + node_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _DEPTH2SPACE_CRD_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( depth2space_internal, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c index 5572007..ef10ea5 100644 --- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -42,6 +42,7 @@ __BEGIN_DECLS typedef enum { UNARY_SIN, + UNARY_COS, UNARY_EXP, UNARY_LOG, UNARY_ELU, @@ -89,6 +90,7 @@ typedef enum VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() }, #define SIN_OPERATION sin +#define COS_OPERATION cos #define EXP_OPERATION exp #define LOG_OPERATION log #define ELU_OPERATION elu @@ -107,6 +109,8 @@ static const struct { { TENSOR_UNARY_KERNELS_FLOAT(SIN_OPERATION, UNARY_SIN, F32, F32) TENSOR_UNARY_KERNELS_FLOAT(SIN_OPERATION, UNARY_SIN, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT(COS_OPERATION, UNARY_COS, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT(COS_OPERATION, UNARY_COS, F16, F16) TENSOR_UNARY_KERNELS_FLOAT(EXP_OPERATION, UNARY_EXP, F32, F32) TENSOR_UNARY_KERNELS_FLOAT(EXP_OPERATION, UNARY_EXP, F16, F16) TENSOR_UNARY_KERNELS_FLOAT(LOG_OPERATION, UNARY_LOG, F32, F32) @@ -128,6 +132,8 @@ static const struct { TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F32, F32) TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT_2D(COS_OPERATION, UNARY_COS, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT_2D(COS_OPERATION, UNARY_COS, F16, F16) TENSOR_UNARY_KERNELS_FLOAT_2D(EXP_OPERATION, UNARY_EXP, F32, F32) TENSOR_UNARY_KERNELS_FLOAT_2D(EXP_OPERATION, UNARY_EXP, F16, F16) TENSOR_UNARY_KERNELS_FLOAT_2D(LOG_OPERATION, UNARY_LOG, F32, F32) @@ -148,6 +154,7 @@ static const struct { TENSOR_UNARY_KERNELS_FLOAT_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16) TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, U8) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, U8, U8) TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, U8) TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, U8, U8) TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, U8, U8) @@ -159,6 +166,7 @@ static const struct { TENSOR_UNARY_KERNELS(HGELU_OPERATION, UNARY_HGELU, U8, U8) TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8) TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8) TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8) TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, U8, U8) @@ -175,6 +183,7 @@ static const struct { }; #undef SIN_OPERATION +#undef COS_OPERATION #undef EXP_OPERATION #undef LOG_OPERATION #undef ELU_OPERATION @@ -438,6 +447,7 @@ OnError: REGISTER_ELTWISE_UNARY_BACKEND_CL( sin, UNARY_SIN ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( cos, UNARY_COS ) REGISTER_ELTWISE_UNARY_BACKEND_CL( exp, UNARY_EXP ) REGISTER_ELTWISE_UNARY_BACKEND_CL( log, UNARY_LOG ) REGISTER_ELTWISE_UNARY_BACKEND_CL( elu, UNARY_ELU ) diff --git a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c index 1f0ba44..af31ed1 100644 --- a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c @@ -103,7 +103,6 @@ static vx_param_description_t _floordiv_kernel_param_def[] = #define SCALAR_OUTPUT_SCALE (7) #define SCALAR_OUTPUT_TAIL (8) -#define FLOORDIV_PARAM_NUM 3 #define FLOORDIV_QUANT_PARAM_NUM _cnt_of_array( _floordiv_kernel_param_def 
) /* @@ -154,8 +153,6 @@ final: return status; } /* _floordiv_initializer() */ - - /* * Query kernel */ @@ -164,8 +161,7 @@ static vsi_status _query_kernel vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, - vsi_bool image_2d, - vsi_bool *is_use_u8_kernel + vsi_bool image_2d ) { vsi_status status = VSI_FAILURE; @@ -189,7 +185,7 @@ static vsi_status _query_kernel { in0_dtype = F32; } - else if (I16 == in0_dtype) + else if (I16 == in0_dtype || I8 == in0_dtype) { in0_dtype = I32; } @@ -198,7 +194,7 @@ static vsi_status _query_kernel { in1_dtype = F32; } - else if (I16 == in1_dtype) + else if (I16 == in1_dtype || I8 == in1_dtype) { in1_dtype = I32; } @@ -207,16 +203,9 @@ static vsi_status _query_kernel { out_dtype = F32; } - - if ((U8 == in0_dtype) || (U8 == in1_dtype) || (U8 == out_dtype)) + else if (I16 == out_dtype || I8 == out_dtype) { - param_def_size = FLOORDIV_QUANT_PARAM_NUM; - *is_use_u8_kernel = TRUE; - } - else - { - param_def_size = FLOORDIV_PARAM_NUM; - *is_use_u8_kernel = FALSE; + out_dtype = I32; } key = FLOORDIV_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d); @@ -228,7 +217,7 @@ static vsi_status _query_kernel break; } } - if( i < kernel_map_size ) + if ( i < kernel_map_size ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); kernel->info.parameters = param_def; @@ -262,19 +251,18 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_FLOORDIV_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; - float outputScale = vsi_nn_get_tensor_scale(outputs[0]); - float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); - float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); - float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); - float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); - float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]); - vsi_bool is_use_u8_kernel = FALSE; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]); outputScale = 1.0f / outputScale; input0Tail = -(input0Tail * input0Scale); input1Tail = -(input1Tail * input1Scale); - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -282,40 +270,35 @@ static vsi_nn_kernel_node_t _setup image_2d = (outputs[0]->attr.dim_num == 2); - status = _query_kernel( kernel, inputs, outputs, image_2d, &is_use_u8_kernel); - if( VSI_SUCCESS == status) + status = _query_kernel( kernel, inputs, outputs, image_2d); + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { - size_t node_params_num = FLOORDIV_PARAM_NUM; + size_t node_params_num = FLOORDIV_QUANT_PARAM_NUM; /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( node_params, _FLOORDIV_PARAM_NUM, inputs, input_num, outputs, output_num ); - if (is_use_u8_kernel) - { - node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale ); - node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input0Tail ); - node_params[SCALAR_INPUT1_SCALE] = 
vsi_nn_kernel_scalar_create( graph, F32, &input1Scale ); - node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input1Tail ); - node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); - node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail ); - node_params_num = FLOORDIV_QUANT_PARAM_NUM; - } + + node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale ); + node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input0Tail ); + node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale ); + node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input1Tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); VSI_ASSERT( status == VSI_SUCCESS ); - if (is_use_u8_kernel) - { - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_SCALE] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_TAIL] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_SCALE] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] ); - } + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] ); } } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c index aa5e2e5..fdeda2e 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c @@ -47,7 +47,8 @@ typedef enum INTERNAL_KERNEL_GATHER, } _internal_kernel_e; -#define _GATHER_KERNEL_SOURCE "gather" +#define _GATHER_KERNEL_SOURCE "gather" +#define _GATHER_BATCH_KERNEL_SOURCE "gather_batch" // Add kernel hashtable here #define VX_KERNEL_NAME_GATHER_U8TOU8 CVIVANTE_NAMESPACE("cl.gather_U8toU8") @@ -55,25 +56,39 @@ typedef enum #define VX_KERNEL_NAME_GATHER_I32TOI32 CVIVANTE_NAMESPACE("cl.gather_I32toI32") #define VX_KERNEL_NAME_GATHER_F32TOF32 CVIVANTE_NAMESPACE("cl.gather_F32toF32") +#define VX_KERNEL_NAME_GATHER_BATCH_U8TOU8 CVIVANTE_NAMESPACE("cl.gather_batch_U8toU8") +#define VX_KERNEL_NAME_GATHER_BATCH_F16TOF16 CVIVANTE_NAMESPACE("cl.gather_batch_F16toF16") +#define VX_KERNEL_NAME_GATHER_BATCH_I32TOI32 CVIVANTE_NAMESPACE("cl.gather_batch_I32toI32") +#define VX_KERNEL_NAME_GATHER_BATCH_F32TOF32 CVIVANTE_NAMESPACE("cl.gather_batch_F32toF32") + // Add kernel hashtable here -#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ - ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) +#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _image_2d, _batch) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d << 4) | (_batch)) #define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ - { 
HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0), \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0), \ VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, +#define TENSOR_GATHER_BATCH_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 1), \ + VX_KERNEL_NAME_GATHER_BATCH_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + static const struct { uint32_t key; char* function_name; const char* source_name; } gather_map[] = { - TENSOR_GATHER_KERNELS(U8, I32, U8, _GATHER_KERNEL_SOURCE) + TENSOR_GATHER_KERNELS(U8, I32, U8, _GATHER_KERNEL_SOURCE) TENSOR_GATHER_KERNELS(F16, I32, F16, _GATHER_KERNEL_SOURCE) TENSOR_GATHER_KERNELS(I32, I32, I32, _GATHER_KERNEL_SOURCE) TENSOR_GATHER_KERNELS(F32, I32, F32, _GATHER_KERNEL_SOURCE) + TENSOR_GATHER_BATCH_KERNELS(U8, I32, U8, _GATHER_BATCH_KERNEL_SOURCE) + TENSOR_GATHER_BATCH_KERNELS(F16, I32, F16, _GATHER_BATCH_KERNEL_SOURCE) + TENSOR_GATHER_BATCH_KERNELS(I32, I32, I32, _GATHER_BATCH_KERNEL_SOURCE) + TENSOR_GATHER_BATCH_KERNELS(F32, I32, F32, _GATHER_BATCH_KERNEL_SOURCE) }; /* @@ -88,6 +103,7 @@ static vx_param_description_t _gather_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; #define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def ) @@ -97,6 +113,7 @@ static vsi_status cal_gather_tensor_reshape_size vsi_nn_tensor_t ** inputs, vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], uint32_t block_size, + vsi_size_t batch_dims, uint32_t idxFlg ) { @@ -105,30 +122,37 @@ static vsi_status cal_gather_tensor_reshape_size vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; vsi_size_t elementCnt = 1; + vsi_size_t outerCnt = 1; #define VSI_NN_MAX_IMAGE_WIDTH (65536) - for(i = 0; i < dims_num; ++i) + for (i = 0; i < dims_num - batch_dims; ++i) { elementCnt *= input_size[i]; } - for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + for (; i < dims_num; ++i) + { + outerCnt *= input_size[i]; + } + + for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) { sizes[i] = 1; } - if(idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH) + if (idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH) { sizes[0] = elementCnt; - sizes[1] = 1; + sizes[1] = outerCnt; status = VSI_SUCCESS; } else { - if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) { sizes[0] = block_size; sizes[1] = elementCnt / block_size; + sizes[2] = outerCnt; status = VSI_SUCCESS; } } @@ -160,9 +184,9 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) vsi_size_array_t * input1_shape = NULL; int32_t block_size = 0; int32_t block_num = 0; - vsi_ssize_t indices_num = 1; - size_t input_dims1 = 0; - size_t i = 0; + vsi_ssize_t indices_num = 1; + size_t input_dims1 = 0; + size_t i = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -176,7 +200,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) input1_shape = attr[1]->shape; input_dims1 = input1_shape->size; - for (i = 0; i < input_dims1; i++) + for (i = 0; i < input_dims1 - 1; i++) { indices_num *= input1_shape->data[i]; } @@ -214,7 +238,8 @@ static vsi_status _query_kernel ( vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs + vsi_nn_tensor_t * const * const outputs, + int32_t is_batch /* Add extra 
params */ ) { @@ -227,17 +252,17 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0 ); + key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0, is_batch ); - for( i = 0; i < _cnt_of_array(gather_map); i ++ ) + for ( i = 0; i < _cnt_of_array(gather_map); i ++ ) { - if( gather_map[i].key == key ) + if ( gather_map[i].key == key ) { break; } } - if( i < _cnt_of_array(gather_map) ) + if ( i < _cnt_of_array(gather_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", gather_map[i].function_name ); kernel->info.parameters = _gather_kernel_param_def; @@ -271,54 +296,69 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_GATHER_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" ); int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); int32_t indices_num = vsi_nn_kernel_param_get_int32( params, "indices_num" ); + int32_t is_batch = batch_dims > 0 ? 1 : 0; + vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3; + int32_t i = 0; - status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0); - status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1); - status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0); - if(status != VSI_SUCCESS) + status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0); + status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1); + status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0); + if (status != VSI_SUCCESS) { return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rs_dim ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], 2 ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], rs_dim ); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if( VSI_SUCCESS == status) + status = _query_kernel( kernel, inputs, outputs, is_batch ); + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { - uint32_t index = 0; -#define RESHAPE_DIM 2 + uint32_t index = 3; + int32_t batch = (int32_t)shapes[1][1]; + /* Pass parameters to node. 
*/ - node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], RESHAPE_DIM ); - node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[1], RESHAPE_DIM ); - node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], RESHAPE_DIM ); -#undef RESHAPE_DIM + vsi_nn_kernel_node_pack_io( node_params, _GATHER_PARAM_NUM, + reshape_tensors, 2, &reshape_tensors[2], 1 ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &indices_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &batch ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, _GATHER_PARAM_NUM ); - CHECK_STATUS(status); - vsi_nn_kernel_tensor_release( &node_params[0] ); - vsi_nn_kernel_tensor_release( &node_params[1] ); - vsi_nn_kernel_tensor_release( &node_params[2] ); vsi_nn_kernel_scalar_release( &node_params[3] ); vsi_nn_kernel_scalar_release( &node_params[4] ); vsi_nn_kernel_scalar_release( &node_params[5] ); vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); } } + + for (i = 0; i < 3; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c index 49ccd23..95a4bff 100644 --- a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include @@ -445,45 +444,6 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ -static int32_t _optimize_gn_shape_cl - ( - vsi_nn_tensor_t ** inputs, - vsi_size_t group_size, - int32_t group_num, - vsi_size_t* opt_shape, - int32_t* is2D_flg - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; - vsi_size_t new_rank = 0; - group_shape[0] = inputs[0]->attr.size[0]; - group_shape[1] = inputs[0]->attr.size[1]; - group_shape[2] = group_size; - - vsi_nn_kernel_optimize_element_shape(group_shape, 3, opt_shape, &new_rank ); - - if (opt_shape[1] == 1) - { - opt_shape[1] = group_num; - opt_shape[2] = 1; - opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; - is2D_flg[0] = 1; - } - else if (new_rank == 2) - { - opt_shape[2] = group_num; - opt_shape[3] = inputs[0]->attr.dim_num > 3 ? 
inputs[0]->attr.size[3] : 1; - } - else - { - status = VSI_FAILURE; - } - - return status; -} - - static vsi_nn_kernel_node_t _setup ( vsi_nn_graph_t * graph, @@ -535,11 +495,13 @@ static vsi_nn_kernel_node_t _setup return NULL; } - status = _optimize_gn_shape_cl(inputs, group_size, group_num, new_shape, &is2D_flg); + status = vsi_nn_kernel_optimize_group_norm_shape( (const vsi_size_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num, group_num, 0, new_shape); if ( VSI_SUCCESS != status ) { goto final; } + is2D_flg = (new_shape[2] == 1) && ((int32_t)new_shape[1] == group_num); rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4); rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4); diff --git a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c index 929c812..58eb2ee 100644 --- a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c @@ -406,12 +406,12 @@ static vsi_nn_kernel_node_t _setup uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; uint32_t hashkey = 0; int32_t i = 0; - + uint32_t rank = outputs[0]->attr.dim_num; float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - int32_t reshape_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" ); - size_t width = inputs[0]->attr.size[0]; size_t height = inputs[0]->attr.size[1]; + int32_t reshape_flg = outputs[0]->attr.size[1] * outputs[0]->attr.size[2] < GPU_TENSOR_MAX_WIDTH + && rank > 2; int32_t group_num = (int32_t)(width + 15) / 16; int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]); float input_scale = vsi_nn_get_tensor_scale(inputs[0]); diff --git a/src/tim/vx/internal/src/kernel/cl/moments_cl.c b/src/tim/vx/internal/src/kernel/cl/moments_cl.c index ed420ad..e5bae71 100644 --- a/src/tim/vx/internal/src/kernel/cl/moments_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/moments_cl.c @@ -101,18 +101,23 @@ static const _kernel_map_type moments_map[] = TENSOR_MOMENTS_KERNELS(U8, F32, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS(F32, F32, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS(I32, F32, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS(BF16,F32, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS(U8, F32, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS(F32, F32, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS(I32, F32, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS(BF16,F32, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS(U8, F32, 2, KERNEL_SOURCE_3) TENSOR_MOMENTS_KERNELS(F32, F32, 2, KERNEL_SOURCE_3) TENSOR_MOMENTS_KERNELS(I32, F32, 2, KERNEL_SOURCE_3) + TENSOR_MOMENTS_KERNELS(BF16,F32, 2, KERNEL_SOURCE_3) TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, F32, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS(F32, F32, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS(I32, F32, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS(BF16,F32, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F32, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(F32, F32, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(I32, F32, 0, 1, 2, KERNEL_SOURCE_5) + TENSOR_MOMENTS_THREE_AXIS_KERNELS(BF16,F32, 0, 1, 2, KERNEL_SOURCE_5) }; /* diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c new file mode 100644 index 0000000..ad99bc6 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c @@ -0,0 +1,301 @@ +/**************************************************************************** +* +* Copyright (c) 2020 
Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define _TOPK_KERNEL_SOURCE "topk" +#define STR(a) #a +// Add kernel hashtable here +#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \ + ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) ) +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \ + { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \ + CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ + _TOPK_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _topk_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32, 0 ), + PACK_KERNEL_MAP( F32, F32, 1 ), + PACK_KERNEL_MAP( F32, F32, 2 ), + PACK_KERNEL_MAP( F32, F32, 3 ), + PACK_KERNEL_MAP( F32, F32, 4 ), + PACK_KERNEL_MAP( F32, F32, 5 ), + PACK_KERNEL_MAP( F32, F32, 6 ), + + PACK_KERNEL_MAP( U32, U32, 0 ), + PACK_KERNEL_MAP( U32, U32, 1 ), + PACK_KERNEL_MAP( U32, U32, 2 ), + PACK_KERNEL_MAP( U32, U32, 3 ), + PACK_KERNEL_MAP( U32, U32, 4 ), + PACK_KERNEL_MAP( U32, U32, 5 ), + PACK_KERNEL_MAP( U32, U32, 6 ), + + PACK_KERNEL_MAP( I32, I32, 0 ), + PACK_KERNEL_MAP( I32, I32, 1 ), + PACK_KERNEL_MAP( I32, I32, 2 ), + PACK_KERNEL_MAP( I32, I32, 3 ), + PACK_KERNEL_MAP( I32, I32, 4 ), + PACK_KERNEL_MAP( I32, I32, 5 ), + PACK_KERNEL_MAP( I32, I32, 6 ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _topk_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def ) +#define SCALAR_INPUT_NUM_STAGES (3) +#define SCALAR_INPUT_WIDTH (4) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_topk_initializer) + ( + vsi_nn_kernel_node_t node, + const 
vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_size_array_t * in_shape = NULL; + int32_t num_stages = 0; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_NUM_STAGES], &num_stages); + CHECK_STATUS_FAIL_GOTO(status, final ); + + in_shape = input_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.local_size[0] = (size_t)(1 << num_stages); + gpu_param.local_size[1] = 1; + gpu_param.global_size[0] = (size_t)(1 << num_stages); + gpu_param.global_size[1] = in_shape->data[1]; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(input_attr); + return status; +} /* _topk_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t num_stages + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _topk_kernel_map; + size_t kernel_map_size = _cnt_of_array( _topk_kernel_map ); + vx_param_description_t * param_def = _topk_kernel_param_def; + vx_kernel_initialize_f initializer = _topk_initializer; +#define _PACK_SELECT_KEY( in_type, out_type ) \ + ( (in_type) | (out_type << 8) ) + uint32_t key = 0; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + switch (_PACK_SELECT_KEY(in_dtype, out_dtype)) + { + case _PACK_SELECT_KEY(F32, F32): + case _PACK_SELECT_KEY(F16, F16): + key = TOPK_HASH_KEY( F32, F32, num_stages ); + break; + case _PACK_SELECT_KEY(U32, U32): + case _PACK_SELECT_KEY(U16, U16): + case _PACK_SELECT_KEY(U8, U8): + key = TOPK_HASH_KEY( U32, U32, num_stages ); + break; + case _PACK_SELECT_KEY(I32, I32): + case _PACK_SELECT_KEY(I16, I16): + case _PACK_SELECT_KEY(I8, I8): + key = TOPK_HASH_KEY( I32, I32, num_stages ); + break; + default: + break; + } +#undef _PACK_SELECT_KEY + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _topk_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + 
vsi_nn_kernel_node_param_t node_params[_TOPK_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t block_size = inputs[0]->attr.size[0]; + vsi_size_t block_num = 1; + uint32_t i = 0; + vsi_nn_tensor_t* rs_tensors[3] = { NULL }; + vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; + int32_t width = (int32_t)block_size; + int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k"); + int32_t num_stages = (int32_t)ceil(log10(block_size / 2.0f) / log10(2.0f)); + + for (i = 1; i < inputs[0]->attr.dim_num; i ++) + { + block_num = block_num * inputs[0]->attr.size[i]; + } + + if( vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE || + outputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_INT32 ) + { + return NULL; + } + + shape[0][0] = block_size; + shape[0][1] = block_num; + shape[1][0] = top_k; + shape[1][1] = block_num; + + rs_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shape[0], 2 ); + rs_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shape[1], 2 ); + rs_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[1], shape[1], 2 ); + + status = _query_kernel( kernel, inputs, outputs, num_stages ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _TOPK_PARAM_NUM, + rs_tensors, input_num, &rs_tensors[1], output_num ); + /* Pass parameters to node. */ + node_params[SCALAR_INPUT_NUM_STAGES] = vsi_nn_kernel_scalar_create( + graph, I32, &num_stages ); + node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( + graph, I32, &width ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _TOPK_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + } +final: + vsi_safe_release_tensor(rs_tensors[0]); + vsi_safe_release_tensor(rs_tensors[1]); + vsi_safe_release_tensor(rs_tensors[2]); + if (node_params[SCALAR_INPUT_NUM_STAGES]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_NUM_STAGES] ); + } + if (node_params[SCALAR_INPUT_WIDTH]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] ); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( topk, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c index 3aa63e2..17b7be6 100644 --- a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c @@ -40,6 +40,7 @@ __BEGIN_DECLS typedef enum { UNARY_SIN, + UNARY_COS, UNARY_EXP, UNARY_LOG, UNARY_ELU, @@ -69,6 +70,11 @@ static float sin_eval(float data) return sinf(data); } +static float cos_eval(float data) +{ + return cosf(data); +} + static float log_eval(float data) { return logf(data); @@ -212,6 +218,9 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) case UNARY_SIN: data = sin_eval(data); break; + case UNARY_COS: + data = cos_eval(data); + break; case UNARY_EXP: data = exp_eval(data); break; @@ -372,6 +381,7 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_ELTWISE_UNARY_BACKEND_CPU( sin, UNARY_SIN ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( cos, UNARY_COS ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( exp, UNARY_EXP ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( log, UNARY_LOG ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( elu, UNARY_ELU ) diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c index 2ea12a5..b91dabd 100644 --- a/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c +++ 
b/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c @@ -42,7 +42,7 @@ __BEGIN_DECLS /* * Define kernel meta. */ -#define _CPU_ARG_NUM (3) +#define _CPU_ARG_NUM (4) #define _CPU_INPUT_NUM (2) #define _CPU_OUTPUT_NUM (1) #define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) @@ -62,9 +62,9 @@ DEF_KERNEL_EXECUTOR(_gather_exec) uint32_t* buffer_idx = NULL; size_t in_elements = 0, out_elements = 0; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - vsi_size_t i = 0, j = 0; - int32_t block_size = 1, block_num = 1, axis_num = 0; - vsi_size_t indices_num = 1; + vsi_size_t i = 0, j = 0, b = 0; + int32_t block_size = 1, block_num = 1, axis_num = 0, batch_dims = 0; + vsi_size_t indices_num = 1, batch = 1, in_stride = 1, out_stride = 1; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; @@ -86,6 +86,8 @@ DEF_KERNEL_EXECUTOR(_gather_exec) CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis_num); CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &batch_dims); + CHECK_STATUS_FAIL_GOTO(status, final ); buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); @@ -98,26 +100,44 @@ DEF_KERNEL_EXECUTOR(_gather_exec) memset( buffer[1], 0, out_elements * sizeof(float) ); { - for(i = 0; i < attr[1]->shape->size; ++i) + for (i = 0; i < attr[1]->shape->size - (vsi_size_t)batch_dims; i++) { indices_num *= attr[1]->shape->data[i]; } - for(i = 0; i < (vsi_size_t)block_num; i++) + for (; i < attr[1]->shape->size; i++) { - for(j = 0; j < indices_num; j++) + batch *= attr[1]->shape->data[i]; + } + + for (i = 0; i < attr[0]->shape->size - (vsi_size_t)batch_dims; i++) + { + in_stride *= attr[0]->shape->data[i]; + } + + for (i = 0; i < attr[2]->shape->size - (vsi_size_t)batch_dims; i++) + { + out_stride *= attr[2]->shape->data[i]; + } + + for (b = 0; b < batch; b++) + { + for (i = 0; i < (vsi_size_t)block_num; i++) { - uint32_t indice = buffer_idx[j]; - vsi_size_t in_index = (i * axis_num + indice) * block_size; - if(in_index < in_elements) + for (j = 0; j < indices_num; j++) { - vsi_size_t out_index = (i * indices_num + j) * block_size; - memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float)); - } - else - { - status = VX_FAILURE; - CHECK_STATUS_FAIL_GOTO( status, final ); + uint32_t indice = buffer_idx[j + indices_num * b]; + vsi_size_t in_index = (i * axis_num + indice) * block_size + b * in_stride; + if (in_index < in_elements) + { + vsi_size_t out_index = (i * indices_num + j) * block_size + b * out_stride; + memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float)); + } + else + { + status = VX_FAILURE; + CHECK_STATUS_FAIL_GOTO( status, final ); + } } } } @@ -128,20 +148,20 @@ DEF_KERNEL_EXECUTOR(_gather_exec) CHECK_STATUS_FAIL_GOTO( status, final ); final: - if( buffer_idx ) + if ( buffer_idx ) { free( buffer_idx ); } - for( i = 0; i < 2; i ++ ) + for ( i = 0; i < 2; i ++ ) { - if( buffer[i] ) + if ( buffer[i] ) { free( buffer[i] ); } } - for( i = 0; i < _CPU_IO_NUM; i ++ ) + for ( i = 0; i < _CPU_IO_NUM; i ++ ) { - if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } } return status; } /* _gather_exec() */ @@ -156,6 +176,7 @@ static vx_param_description_t _gather_kernel_param_def[] = {VX_INPUT, 
VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; #define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def ) @@ -201,15 +222,16 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 3; - int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); - int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); + int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" ); /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, @@ -218,12 +240,14 @@ static vsi_nn_kernel_node_t _setup backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num ); backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &batch_dims ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); CHECK_STATUS( status ); vsi_nn_kernel_scalar_release( &backend_params[3] ); vsi_nn_kernel_scalar_release( &backend_params[4] ); vsi_nn_kernel_scalar_release( &backend_params[5] ); + vsi_nn_kernel_scalar_release( &backend_params[6] ); } else { diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c index 33e8b33..e446623 100644 --- a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c @@ -103,9 +103,10 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec) if(coord_stride <= 4) // reshape 3D { vsi_ssize_t stride[4] = {block_size, 0, 0, 0}; + int32_t start_dim = (int32_t)attr[0]->shape->size - coord_stride; for(i = 1; i < coord_stride; ++i) { - stride[i] = stride[i - 1] * attr[0]->shape->data[i]; + stride[i] = stride[i - 1] * attr[0]->shape->data[start_dim + i - 1]; } for(i = 0; i < indices_num; i++) @@ -118,8 +119,8 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec) for(j = 0; j < coord_stride; j++) { coord[j] = buffer_idx[i * coord_stride + j]; + in_index += coord[j] * stride[j]; } - in_index = coord[3] * stride[3] + coord[2] * stride[2] + coord[1] * stride[1] + coord[0] * stride[0]; memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float)); } } diff --git a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c index 2744643..cf9bb0e 100644 --- a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c @@ -61,7 +61,13 @@ DEF_KERNEL_EXECUTOR(_instance_norm_exec) float * buffer[_CPU_IO_NUM] = { NULL }; size_t out_elements = 0; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; + vsi_size_t batch = 1; + vsi_size_t depth = 1; + vsi_size_t norm_size = 1; + vsi_size_t b = 
0; + vsi_size_t c = 0; + vsi_size_t i = 0; + size_t rank = 1; float eps = .0f; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; @@ -96,62 +102,55 @@ DEF_KERNEL_EXECUTOR(_instance_norm_exec) CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); memset( buffer[3], 0, out_elements * sizeof(float) ); + rank = attr[0]->shape->size; + + batch = attr[0]->shape->data[rank - 1]; + depth = attr[0]->shape->data[rank - 2]; + + for ( i = 0; i < (vsi_size_t)rank - 2; i++) { - vsi_size_t b = 0, c = 0, h = 0, w = 0; - vsi_size_t height = attr[0]->shape->data[1]; - vsi_size_t width = attr[0]->shape->data[0]; - vsi_size_t ch = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; - vsi_size_t bh = attr[0]->shape->size > 3 ? attr[0]->shape->data[3] : 1; + norm_size *= attr[0]->shape->data[i]; + } - for (b = 0; b < bh; b++) + for (b = 0; b < batch; b++) + { + for (c = 0; c < depth; c++) { - for (c = 0; c < ch; c++) + vsi_size_t page = c * norm_size + b * norm_size * depth; + float sum = .0f; + float sumsq = .0f; + float mean = .0f; + float vari = .0f; + float data = 0; + float scaleVal = buffer[2][c]; + float biasVal = buffer[1][c]; + + for (i = 0; i < norm_size; i++) { - vsi_size_t page = c * (height * width) + b * (height * width * ch); - float sum = .0f; - float sumsq = .0f; - float mean = .0f; - float vari = .0f; - float data = 0; - float scaleVal = buffer[2][c]; - float biasVal = buffer[1][c]; + vsi_size_t index = page + i; + sum += buffer[0][index]; + } - for (h = 0; h < height; h++) - { - vsi_size_t len = page + h * width; + mean = sum / (float)norm_size; - for (w = 0; w < width; w++) - { - vsi_size_t index = len + w; - sum += buffer[0][index]; - } - } - mean = sum / (width * height); - for (h = 0; h < height; h++) - { - vsi_size_t len = page + h * width; - for (w = 0; w < width; w++) - { - vsi_size_t index = len + w; - data = buffer[0][index] - mean; - sumsq += data * data; - } - } - vari = sumsq / (width * height); - vari = (float)(1.0 / sqrtf(vari + eps)); - for (h = 0; h < height; h++) - { - vsi_size_t len = page + h * width; - for (w = 0; w < width; w++) - { - float normVal = 0; - vsi_size_t index = len + w; - data = buffer[0][index] - mean; + for (i = 0; i < norm_size; i++) + { + vsi_size_t index = page + i; + data = buffer[0][index] - mean; + sumsq += data * data; + } - normVal = data * vari * scaleVal + biasVal; - buffer[3][index] = normVal; - } - } + vari = sumsq / (float)norm_size; + vari = (float)(1.0 / sqrtf(vari + eps)); + + for (i = 0; i < norm_size; i++) + { + float normVal = 0; + vsi_size_t index = page + i; + data = buffer[0][index] - mean; + + normVal = data * vari * scaleVal + biasVal; + buffer[3][index] = normVal; } } } @@ -256,4 +255,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CPU( instance_norm, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c index f133568..611bbfa 100644 --- a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c @@ -104,7 +104,6 @@ DEF_KERNEL_EXECUTOR(_compute) in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } for(i = 0; i < _OUTPUT_NUM; i ++) { @@ -311,4 +310,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CPU( resize_bilinear, _setup ) - diff --git 
a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c index a8cec94..3fe4185 100644 --- a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c @@ -63,6 +63,11 @@ __BEGIN_DECLS CVIVANTE_NAMESPACE("evis.argmax_axis"#AXIS"_F16to"#OUT_DTYPE"_2D"), \ HASH_ARGMAX_KERNEL_SOURCE_NAME(AXIS) }, +#define HASH_ARGMAX_KERNELS_MIX_OPT( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_ARGMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 2), \ + CVIVANTE_NAMESPACE("evis.argmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_opt"), \ + HASH_ARGMAX_KERNEL_SOURCE_NAME(AXIS) }, + static const struct { uint32_t key; char* function_name; @@ -132,6 +137,8 @@ static const struct { HASH_ARGMAX_KERNELS_2D(2, U8, I16) HASH_ARGMAX_KERNELS_2D(2, I16, U8) HASH_ARGMAX_KERNELS_2D(2, I16, I16) + HASH_ARGMAX_KERNELS_MIX_OPT(2, U8, I16) + HASH_ARGMAX_KERNELS_MIX_OPT(2, I8, I16) }; static vx_param_description_t kernel_param_def[] = @@ -228,7 +235,18 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer) if (attr[0]->dtype == I8 || attr[0]->dtype == U8) { - if ( attr[1]->dtype == I8 || + if (axis == 2 && + input_shape->data[2] > 1 && + ((attr[1]->dtype == I8 || attr[1]->dtype == U8) + || (attr[1]->dtype == I16 && input_shape->data[2] < 256))) + { + uint32_t pack = ((argLenSub1 & 0xFF) << 24) | ((argLenSub1 & 0xFF) << 16) + | ((argLenSub1 & 0xFF) << 8) | (argLenSub1 & 0xFF); + packedArgIdx[0] = packedArgIdx[1] = pack; + packedArgIdx[2] = packedArgIdx[3] = pack; + gpu_param.global_scale[0] = 16; + } + else if ( attr[1]->dtype == I8 || attr[1]->dtype == U8) { uint32_t pack = ((argLenSub1 & 0xFF) << 24) | ((argLenSub1 & 0xFF) << 16) @@ -302,7 +320,6 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer) } break; case 1: - case 2: { gpu_dp_inst_t uniExtractData_2x8 = {{ 0x11111111, // TCfg @@ -324,6 +341,52 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); } break; + case 2: + { + gpu_dp_inst_t uniExtractData_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtract1stU8toI16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtract2ndU8toI16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtractData_2x8", &uniExtractData_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract1stU8toI16_2x8", &uniExtract1stU8toI16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract2ndU8toI16_2x8", &uniExtract2ndU8toI16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "argLenSub1", &argLenSub1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "packedArgIdx", packedArgIdx ); + 
CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; default: break; } @@ -354,6 +417,16 @@ static vsi_status _query_kernel input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if ((input_dtype == I8 || input_dtype == U8) + && output_dtype == I16 + && axis == 2 + && inputs[0]->attr.size[2] < 256 + && image_2d == 0) + { + image_2d = 2; + } + key = HASH_ARGMAX_HASH_KEY( axis, input_dtype, output_dtype, image_2d ); for( i = 0; i < _cnt_of_array(_argmax_evis_kernel_map); i ++ ) diff --git a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c index ee5a622..dbbe2ad 100644 --- a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c @@ -85,12 +85,12 @@ typedef enum #define COMPARISONS_KERNELS_HALF(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \ { HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 0), \ - HASH_COMPARISONS_SH_KERNEL_NAME(FUNC_NAME, F16, F16), \ + HASH_COMPARISONS_SH_KERNEL_NAME(FUNC_NAME, BF16, BF16), \ SOURCE }, #define COMPARISONS_KERNELS_HALF_2D(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \ { HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 1), \ - HASH_COMPARISONS_SH_KERNEL_2D_NAME(FUNC_NAME, F16, F16), \ + HASH_COMPARISONS_SH_KERNEL_2D_NAME(FUNC_NAME, BF16, BF16), \ SOURCE }, #define LESS_OP less @@ -396,6 +396,26 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer) 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; status = vsi_nn_kernel_gpu_add_param( node, "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); @@ -403,6 +423,10 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer) "uniDatatoFp32Part0_4x4", &uniDatatoFp32Part0_4x4 ); status |= vsi_nn_kernel_gpu_add_param( node, "uniDatatoFp32Part1_4x4", &uniDatatoFp32Part1_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, "input0Scale", &input0Scale ); status |= vsi_nn_kernel_gpu_add_param( node, @@ -453,7 +477,7 @@ static vsi_status _query_kernel int i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); - input1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); output_dtype = output_dtype == I8 ? 
BOOL8 : output_dtype; key = HASH_COMPARISONS_KEY( operation, input0_dtype, input1_dtype, output_dtype, image_2d ); diff --git a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c index 732f949..de5aa83 100644 --- a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c @@ -301,6 +301,7 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) case _PACK_SELECT_KEY( I8, I8): case _PACK_SELECT_KEY( I16, I16): case _PACK_SELECT_KEY( F16, F16): + case _PACK_SELECT_KEY( BF16, BF16): { gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift); multAndoutZP0[0] = (uint32_t)(M0); @@ -367,6 +368,16 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (input0_dtype == BF16) + { + input0_dtype = F16; + } + + if (output_dtype == BF16) + { + output_dtype = F16; + } + key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, blk_flg ); for( i = 0; i < _cnt_of_array(depth2space_crd_map); i ++ ) diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c index 1b99cb1..1e15e71 100644 --- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -42,6 +42,7 @@ __BEGIN_DECLS typedef enum { UNARY_SIN, + UNARY_COS, UNARY_EXP, UNARY_LOG, UNARY_ELU, @@ -79,6 +80,7 @@ typedef enum SOURCE }, #define SIN_OPERATION sin +#define COS_OPERATION cos #define EXP_OPERATION exp #define LOG_OPERATION log #define ELU_OPERATION elu @@ -106,6 +108,17 @@ static const struct { TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, I8, I8 , KERNEL_SOURCE_3D) TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, I8, F16 , KERNEL_SOURCE_3D) TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, BF16, BF16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, U8, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, U8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I8, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, BF16, BF16 , KERNEL_SOURCE_3D) TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, F16 , KERNEL_SOURCE_3D) TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, I16 , KERNEL_SOURCE_3D) TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, U8 , KERNEL_SOURCE_3D) @@ -162,6 +175,17 @@ static const struct { TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, I8 , KERNEL_SOURCE_2D) TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, F16 , KERNEL_SOURCE_2D) TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, BF16, BF16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I16 , 
KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, BF16, BF16 , KERNEL_SOURCE_2D) TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, F16 , KERNEL_SOURCE_2D) TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I16 , KERNEL_SOURCE_2D) TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, U8 , KERNEL_SOURCE_2D) @@ -317,6 +341,7 @@ static const struct { }; #undef SIN_OPERATION +#undef COS_OPERATION #undef EXP_OPERATION #undef LOG_OPERATION #undef ELU_OPERATION @@ -443,6 +468,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) switch( pack_key ) { case _PACK_SELECT_KEY( UNARY_SIN, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_COS, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_EXP, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_LOG, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_ELU, BF16, BF16 ): @@ -736,6 +762,7 @@ OnError: REGISTER_BACKEND_EVIS( KERNEL_NAME, _##KERNEL_NAME##_setup ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sin, UNARY_SIN ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( cos, UNARY_COS ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( exp, UNARY_EXP ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( log, UNARY_LOG ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( elu, UNARY_ELU ) diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index e5b12f7..3be3996 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -64,6 +64,28 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_GATHER_AXIS0_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16_axis0") #define VX_KERNEL_NAME_GATHER_AXIS0_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_batch_U8toU8") +#define VX_KERNEL_NAME_GATHER_BATCH_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_batch_I8toI8") +#define VX_KERNEL_NAME_GATHER_BATCH_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_batch_I16toI16") +#define VX_KERNEL_NAME_GATHER_BATCH_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_F16toF16") +#define VX_KERNEL_NAME_GATHER_BATCH_I8TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_I8toF16") +#define VX_KERNEL_NAME_GATHER_BATCH_I16TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_I16toF16") +#define VX_KERNEL_NAME_GATHER_BATCH_F16TOI8 CVIVANTE_NAMESPACE("evis.gather_batch_F16toI8") +#define VX_KERNEL_NAME_GATHER_BATCH_F16TOI16 CVIVANTE_NAMESPACE("evis.gather_batch_F16toI16") +#define VX_KERNEL_NAME_GATHER_BATCH_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_U8toF16") +#define VX_KERNEL_NAME_GATHER_BATCH_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_batch_F16toU8") + +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_batch_U8toU8_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_batch_I8toI8_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_batch_I16toI16_axis0") +#define 
VX_KERNEL_NAME_GATHER_BATCH_AXIS0_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_F16toF16_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_I8TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_I8toF16_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_I16TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_I16toF16_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_F16TOI8 CVIVANTE_NAMESPACE("evis.gather_batch_F16toI8_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_F16TOI16 CVIVANTE_NAMESPACE("evis.gather_batch_F16toI16_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_U8toF16_axis0") +#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_batch_F16toU8_axis0") + #define VX_KERNEL_NAME_GATHER_ARRAY_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8_array") #define VX_KERNEL_NAME_GATHER_ARRAY_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8_array") #define VX_KERNEL_NAME_GATHER_ARRAY_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16_array") @@ -77,31 +99,43 @@ __BEGIN_DECLS #define KERNEL_SOURCE_1 "gather" #define KERNEL_SOURCE_2 "gather_mix" #define KERNEL_SOURCE_3 "gather_array" +#define KERNEL_SOURCE_4 "gather_batch" +#define KERNEL_SOURCE_5 "gather_mix_batch" // Add kernel hashtable here -#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_axis0, _is_max) \ - ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_axis0 << 4) | (_is_max)) +#define HASH_GATHER_KEY(_in0_type, _in1_type, _out_type, _axis0, _max, _batch) \ + ((_in0_type << 24) | (_in1_type << 16) | (_out_type << 8) | (_axis0 << 6) | (_max << 4) | (_batch)) #define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ - { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0), \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0, 0), \ VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, #define TENSOR_GATHER_AXIS0_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ - { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0), \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0, 0), \ VX_KERNEL_NAME_GATHER_AXIS0_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, #define TENSOR_GATHER_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ - { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 1), \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 1, 0), \ VX_KERNEL_NAME_GATHER_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, #define TENSOR_GATHER_AXIS0_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ - { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 1), \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 1, 0), \ VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, +#define TENSOR_GATHER_BATCH_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0, 1), \ + VX_KERNEL_NAME_GATHER_BATCH_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +#define TENSOR_GATHER_BATCH_AXIS0_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0, 1), \ + VX_KERNEL_NAME_GATHER_BATCH_AXIS0_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -136,6 +170,26 @@ static const struct { TENSOR_GATHER_AXIS0_ARRAY_KERNELS(I8, I32, I8, KERNEL_SOURCE_3) TENSOR_GATHER_AXIS0_ARRAY_KERNELS(I16, I32, I16, KERNEL_SOURCE_3) TENSOR_GATHER_AXIS0_ARRAY_KERNELS(F16, I32, F16, KERNEL_SOURCE_3) + TENSOR_GATHER_BATCH_KERNELS(U8, I32, U8, KERNEL_SOURCE_4) + TENSOR_GATHER_BATCH_KERNELS(I8, I32, I8, KERNEL_SOURCE_4) + 
TENSOR_GATHER_BATCH_KERNELS(I16, I32, I16, KERNEL_SOURCE_4) + TENSOR_GATHER_BATCH_KERNELS(F16, I32, F16, KERNEL_SOURCE_4) + TENSOR_GATHER_BATCH_KERNELS(I8, I32, F16, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_KERNELS(I16, I32, F16, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_KERNELS(F16, I32, I8, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_KERNELS(F16, I32, I16, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_KERNELS(U8, I32, F16, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_KERNELS(F16, I32, U8, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(U8, I32, U8, KERNEL_SOURCE_4) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(I8, I32, I8, KERNEL_SOURCE_4) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(I16, I32, I16, KERNEL_SOURCE_4) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(F16, I32, F16, KERNEL_SOURCE_4) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(I8, I32, F16, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(I16, I32, F16, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(F16, I32, I8, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(F16, I32, I16, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(U8, I32, F16, KERNEL_SOURCE_5) + TENSOR_GATHER_BATCH_AXIS0_KERNELS(F16, I32, U8, KERNEL_SOURCE_5) }; /* @@ -158,6 +212,7 @@ static vsi_status get_gather_tensor_reshape_size vsi_nn_tensor_t ** inputs, vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], vsi_size_t block_size, + vsi_size_t batch_dims, uint32_t idxFlg, int32_t* arrayFlg ) @@ -167,13 +222,19 @@ static vsi_status get_gather_tensor_reshape_size vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; vsi_size_t elementCnt = 1; + vsi_size_t outerCnt = 1; #define VSI_NN_MAX_IMAGE_WIDTH (65536) - for(i = 0; i < dims_num; ++i) + for(i = 0; i < dims_num - batch_dims; ++i) { elementCnt *= input_size[i]; } + for(; i < dims_num; ++i) + { + outerCnt *= input_size[i]; + } + for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) { sizes[i] = 1; @@ -182,13 +243,14 @@ static vsi_status get_gather_tensor_reshape_size if (idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH) { sizes[0] = elementCnt; - sizes[1] = 1; + sizes[1] = outerCnt; status = VSI_SUCCESS; } else { sizes[0] = block_size; sizes[1] = elementCnt / block_size; + sizes[2] = outerCnt; if ((elementCnt / block_size) > VSI_NN_MAX_IMAGE_WIDTH) { arrayFlg[0] = 1; @@ -222,6 +284,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) int32_t block_num = 0; int32_t indices_num = 1; uint32_t input_dims1 = 0; + int32_t batch = 1; vx_uint32 i = 0; vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; vsi_size_array_t * input1_shape = NULL; @@ -283,7 +346,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) input1_shape = attr[1]->shape; input_dims1 = (uint32_t)input1_shape->size; - for (i = 0; i < input_dims1; i++) + for (i = 0; i < input_dims1 - 1; i++) { indices_num *= (int32_t)(input1_shape->data[i]); } @@ -376,6 +439,11 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) #undef _PACK_SELECT_KEY status = vsi_nn_kernel_gpu_add_param(node, "indices_num", &indices_num); + if (attr[2]->shape->size > 2) + { + batch = (int32_t)attr[2]->shape->data[2]; + status = vsi_nn_kernel_gpu_add_param(node, "batch", &batch); + } CHECK_STATUS_FAIL_GOTO(status, OnError ); OnError: @@ -415,6 +483,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) int32_t block_num = 0; int32_t indices_num = 1; + int32_t batch = 1; uint32_t input_dims1 = 0; vx_uint32 i = 0; vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; @@ -475,10 +544,11 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) input1_shape = attr[1]->shape; input_dims1 = (uint32_t)input1_shape->size; - for (i = 0; i < input_dims1; i++) + for (i = 0; i < 
input_dims1 - 1; i++) { indices_num *= (int32_t)(input1_shape->data[i]); } + batch = (int32_t)(input1_shape->data[input_dims1 - 1]); shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; @@ -486,7 +556,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) shaderParam.global_size[0] = gpu_align_p2((indices_num + shaderParam.global_scale[0] - 1) / shaderParam.global_scale[0], 4); shaderParam.global_size[1] = block_num; - shaderParam.global_size[2] = 1; + shaderParam.global_size[2] = batch; status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); @@ -585,6 +655,10 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) #undef _PACK_SELECT_KEY status = vsi_nn_kernel_gpu_add_param(node, "indices_num", &indices_num); + if (attr[2]->shape->size > 2) + { + status |= vsi_nn_kernel_gpu_add_param(node, "batch", &batch); + } CHECK_STATUS_FAIL_GOTO(status, OnError ); OnError: @@ -617,7 +691,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t* kernel, const vsi_nn_kernel_param_t * params, int32_t axis, - int32_t is_array + int32_t is_array, + int32_t is_batch ) { vsi_status status = VSI_FAILURE; @@ -638,7 +713,7 @@ static vsi_status _query_kernel output_dtype = F16; } - key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis, is_array); + key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis, is_array, is_batch); for( i = 0; i < _cnt_of_array(gather_map); i ++ ) { @@ -688,25 +763,30 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t tmp_params[_GATHER_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" ); int32_t axis0_flg = 0; int32_t is_array = block_size > VSI_NN_MAX_BLOCK_SIZE ? 1 : 0; + int32_t is_batch = batch_dims > 0 ? 1 : 0; + vsi_size_t rs_dim = batch_dims == 0 ? 
2 : 3; + int32_t i = 0; if (axis == 0) { - status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, 0, &is_array); - status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1, &is_array); - status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], 0, &is_array); + status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, batch_dims, 0, &is_array); + status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array); + status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], batch_dims, 0, &is_array); axis0_flg = 1; } else { - status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0, &is_array); - status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1, &is_array); - status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &is_array); + status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0, &is_array); + status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array); + status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0, &is_array); axis0_flg = 0; } #undef VSI_NN_MAX_BLOCK_SIZE @@ -715,38 +795,45 @@ static vsi_nn_kernel_node_t _setup return NULL; } + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rs_dim ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], 2 ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], rs_dim ); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } - status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array); + status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array, is_batch); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { - uint32_t index = 0; -#define RESHAPE_DIM 2 + uint32_t index = 3; + /* Pass parameters to node. 
*/ - tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], RESHAPE_DIM ); - tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[1], RESHAPE_DIM ); - tmp_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], RESHAPE_DIM ); -#undef RESHAPE_DIM + vsi_nn_kernel_node_pack_io( tmp_params, _GATHER_PARAM_NUM, + reshape_tensors, 2, &reshape_tensors[2], 1 ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _GATHER_PARAM_NUM ); - CHECK_STATUS(status); - vsi_nn_kernel_tensor_release( &tmp_params[0] ); - vsi_nn_kernel_tensor_release( &tmp_params[1] ); - vsi_nn_kernel_tensor_release( &tmp_params[2] ); vsi_nn_kernel_scalar_release( &tmp_params[3] ); vsi_nn_kernel_scalar_release( &tmp_params[4] ); vsi_nn_kernel_scalar_release( &tmp_params[5] ); } } + + for (i = 0; i < 3; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c index 2894f11..9693c29 100644 --- a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include @@ -994,44 +993,6 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ -static int32_t _optimize_gn_shape - ( - vsi_nn_tensor_t ** inputs, - vsi_size_t group_size, - int32_t group_num, - vsi_size_t* opt_shape, - int32_t* is2D_flg - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; - vsi_size_t new_rank = 0; - group_shape[0] = inputs[0]->attr.size[0]; - group_shape[1] = inputs[0]->attr.size[1]; - group_shape[2] = group_size; - - vsi_nn_kernel_optimize_element_shape( group_shape, 3, opt_shape, &new_rank ); - - if (opt_shape[1] == 1) - { - opt_shape[1] = group_num; - opt_shape[2] = 1; - opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; - is2D_flg[0] = 1; - } - else if (new_rank == 2) - { - opt_shape[2] = group_num; - opt_shape[3] = inputs[0]->attr.dim_num > 3 ? 
inputs[0]->attr.size[3] : 1; - } - else - { - status = VSI_FAILURE; - } - - return status; -} - static vsi_nn_kernel_node_t _setup ( vsi_nn_graph_t * graph, @@ -1077,11 +1038,13 @@ static vsi_nn_kernel_node_t _setup return NULL; } - status = _optimize_gn_shape(inputs, group_size, group_num, new_shape, &is2D_flg); + status = vsi_nn_kernel_optimize_group_norm_shape( (const vsi_size_t*)inputs[0]->attr.size, + inputs[0]->attr.dim_num, group_num, 0, new_shape); if ( VSI_SUCCESS != status ) { goto final; } + is2D_flg = (new_shape[2] == 1) && ((int32_t)new_shape[1] == group_num); rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4); rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4); diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c index 9ddc0bf..4f3367e 100644 --- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -1004,12 +1004,15 @@ static vsi_nn_kernel_node_t _setup uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; uint32_t hashkey = 0; int32_t i = 0; + uint32_t rank = outputs[0]->attr.dim_num; float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - int32_t reshape_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" ); + int32_t reshape_flg = outputs[0]->attr.size[1] * outputs[0]->attr.size[2] < GPU_TENSOR_MAX_WIDTH + && rank > 2; // Check if gpu can support the size if ( !vsi_nn_kernel_gpu_check_shape( - outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) + outputs[0]->attr.size, outputs[0]->attr.dim_num ) || + rank > 4 ) { return NULL; } diff --git a/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c b/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c index ed9561c..6a323c0 100644 --- a/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c @@ -76,9 +76,15 @@ static const _kernel_map_type _logical_ops_kernel_map[] = PACK_KERNEL_MAP(VSI_NN_LOGICAL_OR, I8, I8, "or"), PACK_KERNEL_MAP(VSI_NN_LOGICAL_AND, I8, I8, "and"), PACK_KERNEL_MAP(VSI_NN_LOGICAL_XOR, I8, I8, "xor"), + PACK_KERNEL_MAP(VSI_NN_LOGICAL_OR, BF16, I8, "or"), + PACK_KERNEL_MAP(VSI_NN_LOGICAL_AND, BF16, I8, "and"), + PACK_KERNEL_MAP(VSI_NN_LOGICAL_XOR, BF16, I8, "xor"), PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_OR, I8, I8, "or"), PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_AND, I8, I8, "and"), PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_XOR, I8, I8, "xor"), + PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_OR, BF16, I8, "or"), + PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_AND, BF16, I8, "and"), + PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_XOR, BF16, I8, "xor"), }; @@ -159,6 +165,22 @@ DEF_KERNEL_INITIALIZER(_logical_ops_initializer) status = vsi_nn_kernel_gpu_add_param( node, "uniMulShortMinus1toFp16_2x8", &uniMulShortMinus1toFp16_2x8); CHECK_STATUS_FAIL_GOTO(status, final ); } + else if (BF16 == input_dtype) + { + gpu_dp_inst_t uniConvertInt16toInt8_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000700, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertInt16toInt8_2x8", &uniConvertInt16toInt8_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } status = vsi_nn_kernel_gpu_config( node, &gpu_param ); 
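    /* The BF16 branch above registers only uniConvertInt16toInt8_2x8: the logical
     * result is produced in 16-bit lanes and, presumably, narrowed here to the
     * I8/BOOL8 output layout (inferred from the DP configuration and the
     * BF16-to-I8 entries added to _logical_ops_kernel_map; not stated in the patch). */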
CHECK_STATUS_FAIL_GOTO(status, final ); @@ -209,9 +231,13 @@ static vsi_status _query_kernel return VSI_FAILURE; } - if (BOOL8 == in_dtype && BOOL8 == out_dtype) + if (BOOL8 == in_dtype) { in_dtype = I8; + } + + if (BOOL8 == out_dtype) + { out_dtype = I8; } diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c index f368c97..c03e942 100644 --- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -56,6 +56,7 @@ __BEGIN_DECLS #define KERNEL_SOURCE_12 "matrixmul_u8u8_f16" #define KERNEL_SOURCE_13 "matrixmul_i16" #define KERNEL_SOURCE_14 "matrixmul_f16i16_i16" +#define KERNEL_SOURCE_15 "matrixmul_bf16" #define HASH_MATRIX_MUL_KEY(_input0_type, _input1_type, _output_type, _trans_a, _trans_b) \ ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_trans_a << 4) | (_trans_b)) @@ -110,6 +111,7 @@ static const struct { TENSOR_MATRIX_MUL_KERNELS(I8, F16, F16, KERNEL_SOURCE_8) TENSOR_MATRIX_MUL_KERNELS(I16, F16, F16, KERNEL_SOURCE_8) TENSOR_MATRIX_MUL_KERNELS(F16, F16, F16, KERNEL_SOURCE_2) + TENSOR_MATRIX_MUL_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15) TENSOR_MATRIX_MUL_KERNELS(F16, F16, U8, KERNEL_SOURCE_11) TENSOR_MATRIX_MUL_KERNELS(F16, F16, I8, KERNEL_SOURCE_11) TENSOR_MATRIX_MUL_KERNELS(F16, F16, I16, KERNEL_SOURCE_11) @@ -119,6 +121,7 @@ static const struct { TENSOR_MATRIX_MUL_TRANSB_KERNELS(F16, U8, U8, KERNEL_SOURCE_4) TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, F16, KERNEL_SOURCE_5) TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, U8, KERNEL_SOURCE_5) + TENSOR_MATRIX_MUL_TRANSB_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15) TENSOR_MATRIX_MUL_TRANSA_KERNELS(U8, U8, U8, KERNEL_SOURCE_7) TENSOR_MATRIX_MUL_TRANSA_KERNELS(I8, I8, I8, KERNEL_SOURCE_7) TENSOR_MATRIX_MUL_TRANSA_KERNELS(I16, I16, I16, KERNEL_SOURCE_7) @@ -126,6 +129,7 @@ static const struct { TENSOR_MATRIX_MUL_TRANSA_KERNELS(I8, F16, I8, KERNEL_SOURCE_7) TENSOR_MATRIX_MUL_TRANSA_KERNELS(I16, F16, I16, KERNEL_SOURCE_7) TENSOR_MATRIX_MUL_TRANSA_KERNELS(F16, F16, F16, KERNEL_SOURCE_7) + TENSOR_MATRIX_MUL_TRANSA_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15) }; /* @@ -587,6 +591,36 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) 0x00000600, // AccumType, ConstantType, and PostShift 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; float 
scaleIn0divOut = src0Scale / dstScale; float scaleIn1divOut = src1Scale / dstScale; @@ -936,6 +970,22 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( BF16, BF16, BF16, 0, 0, 0 ): + case _PACK_SELECT_KEY( BF16, BF16, BF16, 0, 0, 1 ): + case _PACK_SELECT_KEY( BF16, BF16, BF16, 0, 1, 0 ): + case _PACK_SELECT_KEY( BF16, BF16, BF16, 0, 1, 1 ): + case _PACK_SELECT_KEY( BF16, BF16, BF16, 1, 0, 0 ): + case _PACK_SELECT_KEY( BF16, BF16, BF16, 1, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; case _PACK_SELECT_KEY( F16, F16, F16, 0, 0, 1 ): { status = vsi_nn_kernel_gpu_add_param( node, diff --git a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c index 3c76c65..5bade3b 100644 --- a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c @@ -64,6 +64,10 @@ __BEGIN_DECLS #define KERNEL_NAME_MAXIMUM_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.maximum_F16F16toI8_2D") #define KERNEL_NAME_MAXIMUM_F16F16TOI16 CVIVANTE_NAMESPACE("evis.maximum_F16F16toI16") #define KERNEL_NAME_MAXIMUM_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.maximum_F16F16toI16_2D") +#define KERNEL_NAME_MAXIMUM_I16I16TOU8 CVIVANTE_NAMESPACE("evis.maximum_I16I16toU8") +#define KERNEL_NAME_MAXIMUM_I16I16TOU8_2D CVIVANTE_NAMESPACE("evis.maximum_I16I16toU8_2D") +#define KERNEL_NAME_MAXIMUM_U8U8TOI16 CVIVANTE_NAMESPACE("evis.maximum_U8U8toI16") +#define KERNEL_NAME_MAXIMUM_U8U8TOI16_2D CVIVANTE_NAMESPACE("evis.maximum_U8U8toI16_2D") #define KERNEL_SOURCE_1 "maximum", #define KERNEL_SOURCE_2 "maximum_fp16", @@ -109,6 +113,7 @@ static const struct { TENSOR_MAX_KERNELS(F16, F16, I8, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS(I8, I8, I8, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS(U8, U8, U8, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS(U8, U8, I16, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS(I16, I16, I16, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS(F16, F16, U8, KERNEL_SOURCE_2) @@ -120,12 +125,14 @@ static const struct { TENSOR_MAX_KERNELS(I16, F16, I16, KERNEL_SOURCE_3) TENSOR_MAX_KERNELS(I16, F16, F16, KERNEL_SOURCE_3) TENSOR_MAX_KERNELS(F16, F16, I16, KERNEL_SOURCE_3) + TENSOR_MAX_KERNELS(I16, I16, U8, KERNEL_SOURCE_3) TENSOR_MAX_KERNELS_2D_HALF(F16, F16, F16, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS_2D_HALF(BF16, BF16, BF16, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_1) + TENSOR_MAX_KERNELS_2D(U8, U8, I16, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_1) TENSOR_MAX_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_2) @@ -137,6 +144,7 @@ static const struct { TENSOR_MAX_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_3) TENSOR_MAX_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_3) TENSOR_MAX_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3) + TENSOR_MAX_KERNELS_2D(I16, I16, U8, KERNEL_SOURCE_3) }; static vx_param_description_t kernel_param_def[] = @@ -192,6 +200,14 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { in0_fl = (uint8_t)attr[0]->dfp.fl; + if (in0_fl > 0) + { + src0Scale = 1.0f / (float) ((int64_t)1 << 
in0_fl); + } + else + { + src0Scale = (float)((int64_t)1 << -in0_fl); + } } else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM) @@ -203,6 +219,14 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) { in1_fl = (uint8_t)attr[1]->dfp.fl; + if (in1_fl > 0) + { + src1Scale = 1.0f / (float) ((int64_t)1 << in1_fl); + } + else + { + src1Scale = (float)((int64_t)1 << -in1_fl); + } } else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM) @@ -242,7 +266,8 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) attr[1]->dtype, attr[2]->dtype ); if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16) - || ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) ) + || ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) + || (attr[0]->dtype == I16 && attr[1]->dtype == I16 && attr[2]->dtype == U8) ) { gpu_param.global_scale[0] = 8; gpu_param.global_scale[1] = 1; @@ -384,6 +409,8 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) case _PACK_SELECT_KEY( U8, U8, U8 ): case _PACK_SELECT_KEY( U8, F16, U8 ): case _PACK_SELECT_KEY( F16, F16, U8 ): + case _PACK_SELECT_KEY( U8, U8, I16 ): + case _PACK_SELECT_KEY( I16, I16, U8 ): { uint16_t M0 = 0; uint16_t M1 = 0; @@ -427,12 +454,15 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); CHECK_STATUS_FAIL_GOTO(status, final ); - if (attr[0]->dtype == U8) + if (attr[0]->dtype == U8 || attr[0]->dtype == I16) { status = vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + if (attr[0]->dtype != I16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + } status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); CHECK_STATUS_FAIL_GOTO(status, final ); } @@ -461,8 +491,11 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 ); status = vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + if (attr[0]->dtype != I16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + } CHECK_STATUS_FAIL_GOTO(status, final ); } } @@ -751,7 +784,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM, tmp_inputs, 2, outputs, 1 ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM ); - } } return node; @@ -760,4 +792,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( maximum, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c index 16be973..9a64243 100644 --- a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c @@ -64,6 +64,10 @@ __BEGIN_DECLS #define KERNEL_NAME_MINIMUM_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.minimum_F16F16toI8_2D") #define KERNEL_NAME_MINIMUM_F16F16TOI16 CVIVANTE_NAMESPACE("evis.minimum_F16F16toI16") #define 
KERNEL_NAME_MINIMUM_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.minimum_F16F16toI16_2D") +#define KERNEL_NAME_MINIMUM_I16I16TOU8 CVIVANTE_NAMESPACE("evis.minimum_I16I16toU8") +#define KERNEL_NAME_MINIMUM_I16I16TOU8_2D CVIVANTE_NAMESPACE("evis.minimum_I16I16toU8_2D") +#define KERNEL_NAME_MINIMUM_U8U8TOI16 CVIVANTE_NAMESPACE("evis.minimum_U8U8toI16") +#define KERNEL_NAME_MINIMUM_U8U8TOI16_2D CVIVANTE_NAMESPACE("evis.minimum_U8U8toI16_2D") #define KERNEL_SOURCE_1 "minimum", #define KERNEL_SOURCE_2 "minimum_fp16", @@ -109,6 +113,7 @@ static const struct { TENSOR_MIN_KERNELS(F16, F16, I8, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS(I8, I8, I8, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS(U8, U8, U8, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS(U8, U8, I16, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS(I16, I16, I16, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS(F16, F16, U8, KERNEL_SOURCE_2) @@ -120,12 +125,14 @@ static const struct { TENSOR_MIN_KERNELS(I16, F16, I16, KERNEL_SOURCE_3) TENSOR_MIN_KERNELS(I16, F16, F16, KERNEL_SOURCE_3) TENSOR_MIN_KERNELS(F16, F16, I16, KERNEL_SOURCE_3) + TENSOR_MIN_KERNELS(I16, I16, U8, KERNEL_SOURCE_3) TENSOR_MIN_KERNELS_2D_HALF(F16, F16, F16, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS_2D_HALF(BF16, BF16, BF16, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_1) + TENSOR_MIN_KERNELS_2D(U8, U8, I16, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_1) TENSOR_MIN_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_2) @@ -137,6 +144,7 @@ static const struct { TENSOR_MIN_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_3) TENSOR_MIN_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_3) TENSOR_MIN_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3) + TENSOR_MIN_KERNELS_2D(I16, I16, U8, KERNEL_SOURCE_3) }; static vx_param_description_t kernel_param_def[] = @@ -192,6 +200,14 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { in0_fl = (uint8_t)attr[0]->dfp.fl; + if (in0_fl > 0) + { + src0Scale = 1.0f / (float) ((int64_t)1 << in0_fl); + } + else + { + src0Scale = (float)((int64_t)1 << -in0_fl); + } } else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM) @@ -203,6 +219,14 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) { in1_fl = (uint8_t)attr[1]->dfp.fl; + if (in1_fl > 0) + { + src1Scale = 1.0f / (float) ((int64_t)1 << in1_fl); + } + else + { + src1Scale = (float)((int64_t)1 << -in1_fl); + } } else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM) @@ -242,7 +266,8 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) attr[1]->dtype, attr[2]->dtype ); if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16) - || ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) ) + || ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) + || (attr[0]->dtype == I16 && attr[1]->dtype == I16 && attr[2]->dtype == U8) ) { gpu_param.global_scale[0] = 8; gpu_param.global_scale[1] = 1; @@ -384,6 +409,8 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) case _PACK_SELECT_KEY( U8, U8, U8 ): case _PACK_SELECT_KEY( U8, F16, U8 ): case _PACK_SELECT_KEY( F16, F16, U8 ): + case _PACK_SELECT_KEY( U8, U8, I16 ): + case _PACK_SELECT_KEY( I16, I16, U8 ): { uint16_t M0 = 0; uint16_t M1 = 0; @@ -427,12 +454,15 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) status = 
vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); CHECK_STATUS_FAIL_GOTO(status, final ); - if (attr[0]->dtype == U8) + if (attr[0]->dtype == U8 || attr[0]->dtype == I16) { status = vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + if (attr[0]->dtype != I16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + } status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); CHECK_STATUS_FAIL_GOTO(status, final ); } @@ -461,8 +491,11 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 ); status = vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + if (attr[0]->dtype != I16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + } CHECK_STATUS_FAIL_GOTO(status, final ); } } @@ -751,7 +784,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM, tmp_inputs, 2, outputs, 1 ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM ); - } } return node; @@ -760,4 +792,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( minimum, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/moments_evis.c b/src/tim/vx/internal/src/kernel/evis/moments_evis.c index cf540bc..bc45fc0 100644 --- a/src/tim/vx/internal/src/kernel/evis/moments_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/moments_evis.c @@ -101,14 +101,17 @@ static const struct { TENSOR_MOMENTS_KERNELS(I8, F16, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS(I16, F16, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS(F16, F16, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS(BF16,BF16,0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS(U8, F16, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS(I8, F16, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS(I16, F16, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS(F16, F16, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS(BF16,BF16,1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS(U8, F16, 2, KERNEL_SOURCE_3) TENSOR_MOMENTS_KERNELS(I8, F16, 2, KERNEL_SOURCE_3) TENSOR_MOMENTS_KERNELS(I16, F16, 2, KERNEL_SOURCE_3) TENSOR_MOMENTS_KERNELS(F16, F16, 2, KERNEL_SOURCE_3) + TENSOR_MOMENTS_KERNELS(BF16,BF16,2, KERNEL_SOURCE_3) TENSOR_MOMENTS_KERNELS(U8, U8, 0, KERNEL_SOURCE_6) TENSOR_MOMENTS_KERNELS(U8, U8, 1, KERNEL_SOURCE_6) TENSOR_MOMENTS_KERNELS(U8, U8, 2, KERNEL_SOURCE_6) @@ -116,26 +119,31 @@ static const struct { TENSOR_MOMENTS_TWO_AXIS_KERNELS(I8, F16, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS(I16, F16, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS(F16, F16, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS(BF16,BF16,0, 1, KERNEL_SOURCE_7) TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, U8, 0, 1, KERNEL_SOURCE_6) TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F16, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(I8, F16, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(I16, F16, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(F16, F16, 0, 1, 2, KERNEL_SOURCE_5) + TENSOR_MOMENTS_THREE_AXIS_KERNELS(BF16,BF16,0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, U8, 0, 1, 
2, KERNEL_SOURCE_7) TENSOR_MOMENTS_KERNELS_2D(U8, F16, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS_2D(I8, F16, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS_2D(I16, F16, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS_2D(F16, F16, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS_2D(BF16,BF16,0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS_2D(U8, F16, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS_2D(I8, F16, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS_2D(I16, F16, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS_2D(F16, F16, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS_2D(BF16,BF16,1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS_2D(U8, U8, 0, KERNEL_SOURCE_6) TENSOR_MOMENTS_KERNELS_2D(U8, U8, 1, KERNEL_SOURCE_6) TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(U8, F16, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(I8, F16, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(I16, F16, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(F16, F16, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(BF16,BF16,0, 1, KERNEL_SOURCE_7) TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(U8, U8, 0, 1, KERNEL_SOURCE_6) }; @@ -461,6 +469,36 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; switch( pack_key ) { @@ -494,6 +532,18 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( BF16, BF16, 1, 0): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", + &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", + &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", + &uniExtractOddData_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; case _PACK_SELECT_KEY( U8, F16, 1, 1): case _PACK_SELECT_KEY( I8, F16, 1, 1): case _PACK_SELECT_KEY( I16, F16, 1, 1): @@ -518,6 +568,16 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( BF16, BF16, 1, 1): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", + &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", + &uniExtractOddData_2x8); + status |= 
vsi_nn_kernel_gpu_add_param(node, "height", &height); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; case _PACK_SELECT_KEY( U8, F16, 1, 2): case _PACK_SELECT_KEY( I8, F16, 1, 2): case _PACK_SELECT_KEY( I16, F16, 1, 2): @@ -542,6 +602,15 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( BF16, BF16, 1, 2): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", + &uniExtractOddData_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; case _PACK_SELECT_KEY( U8, F16, 2, 0): case _PACK_SELECT_KEY( I8, F16, 2, 0): case _PACK_SELECT_KEY( I16, F16, 2, 0): @@ -597,6 +666,18 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( BF16, BF16, 2, 0): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", + &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", + &uniExtractOddData_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; case _PACK_SELECT_KEY( F16, F16, 3, 0): { status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); @@ -608,6 +689,19 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( BF16, BF16, 3, 0): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", + &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", + &uniExtractOddData_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; case _PACK_SELECT_KEY( U8, U8, 1, 0): case _PACK_SELECT_KEY( U8, U8, 1, 1): case _PACK_SELECT_KEY( U8, U8, 1, 2): diff --git a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c index 60de16a..bc78fd3 100644 --- a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c @@ -68,27 +68,29 @@ typedef struct static const _kernel_map_type _one_hot_kernel_map[] = { // Register kernel here - PACK_ONE_HOT_KERNEL_3D( U8, U8 ), - PACK_ONE_HOT_KERNEL_3D( U8, F16 ), - PACK_ONE_HOT_KERNEL_3D( I8, I8 ), - PACK_ONE_HOT_KERNEL_3D( I8, F16 ), - PACK_ONE_HOT_KERNEL_3D( I16, I16 ), - PACK_ONE_HOT_KERNEL_3D( I16, F16 ), - PACK_ONE_HOT_KERNEL_3D( F16, F16 ), - PACK_ONE_HOT_KERNEL_3D( F16, I16 ), - PACK_ONE_HOT_KERNEL_3D( F16, U8 ), - PACK_ONE_HOT_KERNEL_3D( F16, I8 ), + PACK_ONE_HOT_KERNEL_3D( U8, U8 ), + PACK_ONE_HOT_KERNEL_3D( U8, F16 ), + PACK_ONE_HOT_KERNEL_3D( I8, I8 ), + PACK_ONE_HOT_KERNEL_3D( I8, F16 ), + PACK_ONE_HOT_KERNEL_3D( I16, I16 ), + PACK_ONE_HOT_KERNEL_3D( I16, F16 ), + PACK_ONE_HOT_KERNEL_3D( F16, F16 ), + PACK_ONE_HOT_KERNEL_3D( F16, I16 ), + PACK_ONE_HOT_KERNEL_3D( F16, U8 ), 
+ PACK_ONE_HOT_KERNEL_3D( F16, I8 ), + PACK_ONE_HOT_KERNEL_3D( BF16, BF16 ), - PACK_ONE_HOT_KERNEL_2D( U8, U8 ), - PACK_ONE_HOT_KERNEL_2D( U8, F16 ), - PACK_ONE_HOT_KERNEL_2D( I8, I8 ), - PACK_ONE_HOT_KERNEL_2D( I8, F16 ), - PACK_ONE_HOT_KERNEL_2D( I16, I16 ), - PACK_ONE_HOT_KERNEL_2D( I16, F16 ), - PACK_ONE_HOT_KERNEL_2D( F16, F16 ), - PACK_ONE_HOT_KERNEL_2D( F16, I16 ), - PACK_ONE_HOT_KERNEL_2D( F16, U8 ), - PACK_ONE_HOT_KERNEL_2D( F16, I8 ), + PACK_ONE_HOT_KERNEL_2D( U8, U8 ), + PACK_ONE_HOT_KERNEL_2D( U8, F16 ), + PACK_ONE_HOT_KERNEL_2D( I8, I8 ), + PACK_ONE_HOT_KERNEL_2D( I8, F16 ), + PACK_ONE_HOT_KERNEL_2D( I16, I16 ), + PACK_ONE_HOT_KERNEL_2D( I16, F16 ), + PACK_ONE_HOT_KERNEL_2D( F16, F16 ), + PACK_ONE_HOT_KERNEL_2D( F16, I16 ), + PACK_ONE_HOT_KERNEL_2D( F16, U8 ), + PACK_ONE_HOT_KERNEL_2D( F16, I8 ), + PACK_ONE_HOT_KERNEL_2D( BF16, BF16 ), }; @@ -274,6 +276,51 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer) "depth", &depth ); CHECK_STATUS_FAIL_GOTO(status, final ); } + break; + case BF16: + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "depth", &depth ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; default: break; } diff --git a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c index 2bed1e4..c007a08 100644 --- a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c @@ -98,7 +98,6 @@ static const struct { PRELU_KERNELS_2D(I8, F16, F16, _2D, KERNEL_SOURCE0) PRELU_KERNELS_2D(U8, U8, U8, _2D, KERNEL_SOURCE0) PRELU_KERNELS_2D(U8, U8, F16, _2D, KERNEL_SOURCE0) - }; static vx_param_description_t kernel_param_def[] = @@ -199,6 +198,7 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) } else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) { + out_fl = 1; outputZP = (float)attr[2]->asymm.zero_point; input_scale0 = input_scale0 / attr[2]->asymm.scale; } @@ -628,7 +628,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM, reshape_tensors, 2, &reshape_tensors[2], 1 ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, 
_EVIS_PARAM_NUM ); - } } @@ -643,4 +642,3 @@ final: __END_DECLS REGISTER_BACKEND_EVIS( prelu, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index a4e4fa9..e3b5582 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -51,11 +51,13 @@ typedef enum UP_2X_HALF, UP_3X_HALF, UP_4X_HALF, + UP_8X_HALF, } _internal_scale_e; #define _RESIZE_BILINEAR_KERNEL_SOURCE(_input_type) "resize_bilinear_"#_input_type #define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt" -#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers" +#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_1" +#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_2" #define STR(a) #a // Add kernel hashtable here @@ -81,19 +83,25 @@ typedef enum { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ "_SAME_2x_upsample_half_pixel_centers"), \ - _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) } + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } #define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \ { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ "_SAME_4x_upsample_half_pixel_centers"), \ - _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) } + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_SAME_8x_upsample_half_pixel_centers"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(IN_DTYPE) } #define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \ { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ "_SAME_3x_upsample_half_pixel_centers"), \ - _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) } + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } typedef struct { @@ -120,6 +128,7 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] = PACK_KERNEL_MAP_UP_2X_HALF(U8, U8), PACK_KERNEL_MAP_UP_3X_HALF(U8, U8), PACK_KERNEL_MAP_UP_4X_HALF(U8, U8), + PACK_KERNEL_MAP_UP_8X_HALF(U8, U8), }; @@ -224,6 +233,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) vsi_bool is_2x_up_kernel = FALSE; vsi_bool is_3x_up_kernel = FALSE; vsi_bool is_4x_up_kernel = FALSE; + vsi_bool is_8x_up_kernel = FALSE; input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); @@ -280,6 +290,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height); is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height); is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height); + is_8x_up_kernel = (8 * in_width == out_width) && (8 * in_height == out_height); } if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) @@ -330,7 +341,7 @@ 
DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) outputZP = 0; } - if (is_2x_up_kernel || is_4x_up_kernel) + if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel) { gpu_param.global_scale[0] = 16; gpu_param.global_scale[1] = 1; @@ -479,6 +490,76 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); CHECK_STATUS_FAIL_GOTO(status, final ); } + else if (is_8x_up_kernel) + { + gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, + 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, + 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, + 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, + 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, + 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, + 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, + 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, + 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", 
&uniResize8xUp_l11_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final ); + } else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) { float dfpScale = input_scale * output_scale; @@ -965,25 +1046,25 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) goto final; } - if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel) + if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel&& !is_8x_up_kernel) { status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value); CHECK_STATUS_FAIL_GOTO(status, final ); } - if (is_2x_up_kernel || is_4x_up_kernel) + if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel) { - gpu_param.global_size[0] = gpu_align_p2((out_width + \ - gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); - gpu_param.global_size[1] = depth; - gpu_param.dim = 2; + gpu_param.global_size[0] = gpu_align_p2((out_width + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = depth; + gpu_param.dim = 2; } else { - gpu_param.global_size[0] = gpu_align_p2((out_width + \ - gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); - gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]; - gpu_param.global_size[2] = depth / gpu_param.global_scale[2]; + gpu_param.global_size[0] = gpu_align_p2((out_width + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]; + gpu_param.global_size[2] = depth / gpu_param.global_scale[2]; } status = vsi_nn_kernel_gpu_config( node, &gpu_param ); @@ -1024,6 +1105,8 @@ static vsi_status _query_kernel && (3 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); vsi_bool is_4x_upsample =(4 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ && (4 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); + vsi_bool is_8x_upsample =(8 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ + && (8 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); _internal_scale_e scale_flag = UP; in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); @@ -1032,6 +1115,7 @@ static vsi_status _query_kernel is_2x_upsample &= (in_dtype == U8); is_3x_upsample &= (in_dtype == U8); is_4x_upsample &= (in_dtype == U8); + is_8x_upsample &= (in_dtype == U8); if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0]) { @@ -1047,6 +1131,10 @@ static vsi_status _query_kernel { scale_flag = UP_4X_HALF; } + else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_8x_upsample) + { + scale_flag = UP_8X_HALF; + } else if (is_same_type && is_evis2) { scale_flag = UP_OPT; @@ -1123,7 +1211,6 @@ static vsi_status _query_kernel } return status; - } /* _query_kernel() */ static vsi_nn_tensor_t* _create_scale_tensor @@ -1307,4 +1394,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( resize_bilinear, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c index 
fe8a9d7..778d1fe 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c @@ -74,6 +74,7 @@ static const struct { TENSOR_SCATTER_ND_KERNELS(I32, U8, U8, KERNEL_SOURCE_1) TENSOR_SCATTER_ND_KERNELS(I32, I16, I16, KERNEL_SOURCE_1) TENSOR_SCATTER_ND_KERNELS(I32, F16, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_KERNELS(I32, BF16,BF16, KERNEL_SOURCE_1) TENSOR_SCATTER_ND_BIG_KERNELS(I32, I8, I8, KERNEL_SOURCE_2) TENSOR_SCATTER_ND_BIG_KERNELS(I32, U8, U8, KERNEL_SOURCE_2) TENSOR_SCATTER_ND_BIG_KERNELS(I32, I16, I16, KERNEL_SOURCE_2) @@ -250,8 +251,45 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer) 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + status = vsi_nn_kernel_gpu_add_param( node, "uniAccumulateSum_2x8", &uniAccumulateSum_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, "index_num", &index_num ); status |= vsi_nn_kernel_gpu_add_param( node, "zeropoint", &output_zp ); status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index 02526f5..91ea9cb 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -67,6 +67,13 @@ static vsi_status _gpu_register vsi_nn_kernel_t* kernel ); +static vsi_status _gpu_register_ext + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel, + const char** resources + ); + static vx_program _create_program_from_executable ( vsi_nn_graph_t* graph, @@ -79,6 +86,13 @@ static vx_program _create_program_from_code vsi_nn_kernel_t* kernel ); +static vx_program _create_program_from_code_ext + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel, + const char** resources + ); + static const uint8_t* _load_internal_executable ( const char* source_name, @@ -104,6 +118,14 @@ static void _kernel_clear_source static vsi_bool _check_shader_support(vsi_nn_graph_t* graph); +static vsi_bool vsi_nn_kernel_is_asymmtric_int8 + ( + vsi_nn_tensor_t** inputs, + size_t input_num, + vsi_nn_tensor_t** 
outputs, + size_t output_num + ); + static vsi_status VX_CALLBACK _kernel_validator ( vx_node node, @@ -290,7 +312,7 @@ static char* _load_source_code_from_file size_t read_bytes; source = NULL; //TODO: Pack new name - fp = fopen( source_name, "rb" ); + fp = vsi_nn_fopen( source_name, "rb" ); if( NULL == fp ) { VSILOGE("Open program file %s fail.", source_name); @@ -414,6 +436,58 @@ static vx_program _create_program_from_code return program; } /* _create_program_from_code() */ +static vx_program _create_program_from_code_ext + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel, + const char** resources + ) +{ + const vsi_nn_kernel_source_info_t* source_info; + kernel_program_info_t* program_info; + size_t i; + vx_program program = NULL; + source_info = &kernel->gpu.sources[VSI_NN_GPU_SOURCE_FMT_CODE]; + + if( source_info->num == 0 ) + { + VSILOGE("Not executable source found in kernel."); + return NULL; + } + program_info = (kernel_program_info_t*)malloc( + source_info->num * sizeof(kernel_program_info_t) ); + if( !program_info ) + { + VSILOGE("Malloc program memory fail."); + return NULL; + } + memset( program_info, 0, source_info->num * sizeof(kernel_program_info_t) ); + + for( i = 0; i < source_info->num; i ++ ) + { + program_info[i].data = (const void*)(resources[i]); + if( !program_info[i].data ) + { + program_info[i].reserve_mem = (void*)_load_source_code_from_file( + source_info->data[i], &program_info[i].size ); + program_info[i].data = (const void*)program_info[i].reserve_mem; + } + } + program = _create_program( graph->ctx->c, program_info, source_info->num ); + if( program_info ) + { + for( i = 0; i < source_info->num; i ++ ) + { + if( program_info[i].reserve_mem ) + { + free( program_info[i].reserve_mem ); + } + } + free( program_info ); + } + return program; +} /* _create_program_from_code_ext() */ + static vx_program _create_program_from_executable ( vsi_nn_graph_t* graph, @@ -547,6 +621,113 @@ static vsi_status _gpu_register return status; } /* _gpu_register() */ +static vsi_status _gpu_register_ext + ( + vsi_nn_graph_t* graph, + vsi_nn_kernel_t* kernel, + const char** resources + ) +{ + vsi_status status; + vx_kernel_description_t* info; + vx_kernel obj; + vsi_nn_context_t context; + vx_program program = NULL; + const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt; + +#define MAX_BUILDPROGRAM_LEN 1024 + char cmd[MAX_BUILDPROGRAM_LEN] = { 0 }; + size_t cost_bytes = 0; + + memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN ); + context = graph->ctx; + + status = VSI_FAILURE; + info = &(kernel->info); + + switch( active_fmt ) + { + case VSI_NN_GPU_SOURCE_FMT_CODE: + program = _create_program_from_code_ext( graph, kernel,resources ); + break; + case VSI_NN_GPU_SOURCE_FMT_EXECUTABLE: + program = _create_program_from_executable( graph, kernel ); + break; + default: + VSILOGE("Unknown source format %d", kernel->gpu.active_source_fmt); + break; + } + if( NULL == program ) + { + return status; + } + + if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE ) + { + // set default evis version is 2 + if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type ) + { + cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, + "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", + context->config.use_40bits_va ); + } + } + else + { + cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, + "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", + context->config.evis.ver, context->config.use_40bits_va ); + } + // Pack build option + if( 
kernel->gpu.sources[active_fmt].build_option.data ) + { + vsi_nn_kernel_build_option_t * option = &kernel->gpu.sources[active_fmt].build_option; + if( MAX_BUILDPROGRAM_LEN - cost_bytes > strlen( option->data ) + 1 ) + { + snprintf( &cmd[cost_bytes], MAX_BUILDPROGRAM_LEN - cost_bytes, + " %s", option->data ); + } + else + { + VSILOGE("Build option is too long!"); + VSI_ASSERT( FALSE ); + } + } + + status = vxBuildProgram( program, cmd ); + + if( VSI_SUCCESS != status ) + { + VSILOGE("Build program fail."); + return status; + } + + obj = vxAddKernelInProgram( + program, + info->name, + info->enumeration, + info->numParams, + info->validate, + info->initialize, + info->deinitialize + ); + + if( obj ) + { + status = _kernel_init_obj( info, obj ); + //vxReleaseKernel( &obj ); + } + else + { + VSILOGE( "Add kernel %s fail.", info->name ); + } + if( program ) + { + vxReleaseProgram( &program ); + } + return status; +} /* _gpu_register_ext() */ + static vsi_status _kernel_init_obj ( vx_kernel_description_t* info, @@ -620,6 +801,19 @@ vsi_status vsi_nn_kernel_register return status; } /* vsi_nn_kernel_register() */ +vsi_status vsi_nn_kernel_register_ext + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_t * kernel, + const char** resources + ) +{ + vsi_status status; + status = VSI_FAILURE; + status = _gpu_register_ext( graph, kernel,resources ); + return status; +} /* vsi_nn_kernel_register_ext */ + vsi_nn_kernel_node_t vsi_nn_kernel_create_node ( vsi_nn_graph_t* graph, @@ -667,7 +861,6 @@ vsi_nn_kernel_node_t vsi_nn_kernel_create_node status = vxGetStatus( (vx_reference)obj ); if (VSI_SUCCESS != status) { - fprintf(stderr, "\n"); // TODO: This is a hack for driver msg /* Register kernel */ status = vsi_nn_kernel_register( graph, kernel ); if( VSI_SUCCESS != status ) @@ -712,6 +905,92 @@ vsi_nn_kernel_node_t vsi_nn_kernel_create_node return (vsi_nn_kernel_node_t)node; } /* vsi_nn_kernel_create_node() */ +vsi_nn_kernel_node_t vsi_nn_kernel_create_node_ext + ( + vsi_nn_graph_t * graph, + vsi_nn_kernel_t * kernel, + const char** resources + ){ + vsi_status status; + vx_context ctx; + vx_kernel obj; + vx_node node; + vx_kernel_description_t* info; + + info = &(kernel->info); + // Validate kernel + if( !info->initialize ) + { + VSILOGE("Kernel %s initializer is NULL", info->name); + return NULL; + } + if( !info->validate ) + { + VSILOGE("Kernel %s validator is NULL", info->name); + return NULL; + } + if( !info->deinitialize ) + { + VSILOGE("Kernel %s deinitializer is NULL", info->name); + return NULL; + } + if( info->enumeration == KERNEL_ID_PLACEHOLDER ) + { + //VSILOGD("Kernel id: %#x, %#x", kernel->unique_id, info->enumeration); + info->enumeration = (vx_enum)kernel->unique_id; + } + + ctx = vxGetContext( (vx_reference)graph->g ); + + obj = vxGetKernelByName( ctx, info->name ); + status = vxGetStatus( (vx_reference)obj ); + if (VSI_SUCCESS != status) + { + fprintf(stderr, "\n"); // TODO: This is a hack for driver msg + /* Register kernel */ + status = vsi_nn_kernel_register_ext( graph, kernel,resources ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Register client kernel %s fail with %d.", + info->name, status ); + return NULL; + } + else + { + VSILOGD( "Register client kernel %s successfully.", + info->name ); + } + + /* Load kernel */ + obj = vxGetKernelByName( ctx, info->name ); + status = vxGetStatus( (vx_reference)obj ); + } + if( VSI_SUCCESS != status ) + { + VSILOGE( "Load client kernel %s fail with %d.", + info->name, status ); + return NULL; + } + node = vxCreateGenericNode( graph->g, obj ); + 
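+    /* The node created above holds its own reference to the kernel object, so the local handle obtained from vxGetKernelByName() can be released right away. */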
vxReleaseKernel( &obj ); + status = vxGetStatus( (vx_reference)node ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Load client node from kernel %s fail with %d.", + info->name, status ); + return NULL; + } + if( node ) + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_REPLICATE; + border.constant_value.U32 = 0; + status |= vxSetNodeAttribute( node, VX_NODE_BORDER, &border, sizeof(border) ); + } + return (vsi_nn_kernel_node_t)node; +} /* vsi_nn_kernel_create_node_ext() */ + vsi_status vsi_nn_kernel_node_set_border (vsi_nn_kernel_node_t node, vx_border_t* border) @@ -987,7 +1266,8 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector /* Skip evis and cl when disable shader */ if ( (type == VSI_NN_KERNEL_TYPE_EVIS || type == VSI_NN_KERNEL_TYPE_CL) - && _check_shader_support(graph) == FALSE) + && ( _check_shader_support(graph) == FALSE || + vsi_nn_kernel_is_asymmtric_int8(inputs, input_num, outputs, output_num) ) ) { continue; } @@ -1292,3 +1572,38 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph) return FALSE; } + +static vsi_bool vsi_nn_kernel_is_asymmtric_int8 + ( + vsi_nn_tensor_t** inputs, + size_t input_num, + vsi_nn_tensor_t** outputs, + size_t output_num + ) +{ + size_t i = 0; + + for (i = 0; i < input_num; i++) + { + if ( inputs[i] && + inputs[i]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && + inputs[i]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC + ) + { + return TRUE; + } + } + + for (i = 0; i < output_num; i++) + { + if ( outputs[i] && + outputs[i]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && + outputs[i]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC + ) + { + return TRUE; + } + } + + return FALSE; +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c index da0de6e..105027d 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c @@ -361,7 +361,6 @@ vsi_bool vsi_nn_kernel_optimize_softmax_shape return ret; } /* vsi_nn_kernel_optimize_softmax_shape() */ - typedef enum { TILE_STATE_AXIS_X = 0, @@ -611,4 +610,47 @@ vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape *out_rank = vsi_nn_min(dim_num, 3); return TRUE; +} + +vsi_bool vsi_nn_kernel_optimize_group_norm_shape + ( + const vsi_size_t* shape, const uint32_t rank, int32_t groups, + int32_t is_sp_kernel, vsi_size_t* out_shape + ) +{ + vsi_status status = VSI_SUCCESS; + uint32_t i = 0; + vsi_size_t out_rank = 0; + vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; + group_shape[0] = shape[0]; + group_shape[1] = shape[1]; + group_shape[2] = shape[2] / groups; + + vsi_nn_kernel_optimize_element_shape( group_shape, 3, out_shape, &out_rank ); + + if (!is_sp_kernel && out_shape[1] == 1 && out_rank < 3) + { + out_shape[1] = groups; + out_shape[2] = 1; + out_shape[3] = 1; + for (i = 3; i < rank; i++) + { + out_shape[3] = out_shape[3] * shape[i]; + } + } + else if (out_rank == 2) + { + out_shape[2] = groups; + out_shape[3] = 1; + for (i = 3; i < rank; i++) + { + out_shape[3] = out_shape[3] * shape[i]; + } + } + else + { + status = VSI_FAILURE; + } + + return status; } \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c new file mode 100644 index 0000000..955c61d --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c @@ -0,0 +1,84 @@ 
+/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include +#include "utils/vsi_nn_dtype_util_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_lut.h" + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vx_node node = NULL; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + + node = vxBatchNormalizationLayer( + graph->g, + eps, + inputs[1]->t, + inputs[2]->t, + inputs[3]->t, + inputs[4]->t, + inputs[0]->t, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* _setup() */ + +#define REGISTER_BATCH_NORM_OPENVX_KERNEL(KERNEL_NAME) \ + static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num, \ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) \ + { \ + return _setup(graph, inputs, input_num, outputs, output_num, \ + params, kernel); \ + } \ + REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) + +REGISTER_BATCH_NORM_OPENVX_KERNEL( batch_norm ) + +#undef REGISTER_BATCH_NORM_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/convolutional.c b/src/tim/vx/internal/src/kernel/vx/convolutional.c index 8cc0794..89c8fa4 100644 --- a/src/tim/vx/internal/src/kernel/vx/convolutional.c +++ b/src/tim/vx/internal/src/kernel/vx/convolutional.c @@ -181,6 +181,51 @@ static vsi_bool _build_vx_conv3d_param } /* _build_vx_conv2d_param() */ #endif +#if VX_DECONV_3D_API_SUPPORT +static vsi_bool _build_vx_deconv3d_param + ( + vx_nn_deconvolution_3d_params_t * param, + int32_t stride_d, int32_t stride_h, int32_t stride_w, + int32_t pad_d_front, int32_t pad_d_end, + int32_t pad_h_front, int32_t pad_h_end, + int32_t pad_w_front, int32_t pad_w_end, + int32_t outpadding_d, int32_t outpadding_h, int32_t outpadding_w, + int32_t group, vsi_enum overflow_policy, + vsi_enum rounding_policy, vsi_enum 
down_scale_size_rounding + ) +{ + VSI_ASSERT( stride_d > 0 ); + VSI_ASSERT( stride_h > 0 ); + VSI_ASSERT( stride_w > 0 ); + VSI_ASSERT( outpadding_d >= 0 ); + VSI_ASSERT( outpadding_h >= 0 ); + VSI_ASSERT( outpadding_w >= 0 ); + VSI_ASSERT( group >= 0 ); + + param->padding_d_front = (uint32_t)pad_d_front; + param->padding_d_rear = (uint32_t)pad_d_end; + param->padding_h_top = (uint32_t)pad_h_front; + param->padding_h_bottom = (uint32_t)pad_h_end; + param->padding_w_left = (uint32_t)pad_w_front; + param->padding_w_right = (uint32_t)pad_w_end; + + param->a_w = outpadding_w; + param->a_h = outpadding_h; + param->a_d = outpadding_d; + + param->overflow_policy = (vx_enum)overflow_policy; + param->rounding_policy = (vx_enum)rounding_policy; + param->down_scale_size_rounding = (vx_enum)down_scale_size_rounding; + param->channel_group = group; + + param->stride_w = (uint32_t)stride_w; + param->stride_h = (uint32_t)stride_h; + param->stride_d = (uint32_t)stride_d; + + return TRUE; +} /* _build_vx_deconv3d_param() */ +#endif + static vx_tensor _expand_tensor_dim ( vx_tensor tensor, vsi_ssize_t * shape, size_t rank, vsi_ssize_t expand_dim ) { @@ -242,7 +287,7 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) vx_node node = NULL; vx_nn_convolution_params_ext2_t vxparam; vx_tensor temp_tensors[3] = { NULL }; - int i; + uint32_t i = 0; _build_vx_conv2d_param( @@ -270,7 +315,6 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) { uint8_t * data = NULL; vsi_nn_tensor_attr_t attr; - uint32_t i; data = vsi_nn_ConvertTensorToData( graph, inputs[1] ); CHECK_PTR_FAIL_GOTO( data, "Convert data fail.", final ); @@ -317,7 +361,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) vx_node node = NULL; vx_nn_convolution_params_ext2_t vxparam; vx_tensor temp_tensors[3] = { NULL }; - int32_t i; + uint32_t i = 0; vsi_bool need_explicit_padding = FALSE; _build_vx_conv2d_param( @@ -344,7 +388,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) new_w_shape[0] = inputs[1]->attr.size[0]; new_w_shape[1] = 1; new_w_shape[2] = 1; - for (i = 1; i < (int32_t)(inputs[1]->attr.dim_num); i++) + for (i = 1; i < inputs[1]->attr.dim_num; i++) { new_w_shape[2] *= inputs[1]->attr.size[i]; } @@ -358,7 +402,6 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) { uint8_t * data = NULL; vsi_nn_tensor_attr_t attr; - uint32_t i; data = vsi_nn_ConvertTensorToData( graph, inputs[1] ); CHECK_PTR_FAIL_GOTO( data, "Convert data fail.", final ); @@ -576,4 +619,41 @@ REGISTER_CONV_OPENVX_KERNEL( conv3d ) return (vsi_nn_kernel_node_t)node; } /* depthwise_conv2d*/ -#undef REGISTER_CONV_OPENVX_KERNEL +REGISTER_CONV_OPENVX_KERNEL( deconv3d ) +{ + vx_node node = NULL; +#if VX_DECONV_3D_API_SUPPORT + vx_nn_deconvolution_3d_params_t vxparam; + memset(&vxparam, 0, sizeof(vxparam)); + + _build_vx_deconv3d_param( + &vxparam, + vsi_nn_kernel_param_get_int32(params, "stride_d"), + vsi_nn_kernel_param_get_int32(params, "stride_h"), + vsi_nn_kernel_param_get_int32(params, "stride_w"), + vsi_nn_kernel_param_get_int32(params, "pad_front"), + vsi_nn_kernel_param_get_int32(params, "pad_end"), + vsi_nn_kernel_param_get_int32(params, "pad_top"), + vsi_nn_kernel_param_get_int32(params, "pad_bottom"), + vsi_nn_kernel_param_get_int32(params, "pad_left"), + vsi_nn_kernel_param_get_int32(params, "pad_right"), + vsi_nn_kernel_param_get_int32(params, "outpadding_d"), + vsi_nn_kernel_param_get_int32(params, "outpadding_h"), + vsi_nn_kernel_param_get_int32(params, "outpadding_w"), + vsi_nn_kernel_param_get_int32(params, "group"), + vsi_nn_kernel_param_get_int32(params, 
"overflow_policy"), + vsi_nn_kernel_param_get_int32(params, "rounding_policy"), + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + ); + + node = vxDeconv3dLayer( graph->g, + inputs[0]->t, inputs[1]->t, inputs[2] ? inputs[2]->t : NULL, + &vxparam, + sizeof( vxparam), + outputs[0]->t + ); +#endif + return (vsi_nn_kernel_node_t)node; +} /* deconv3d */ + +#undef REGISTER_CONV_OPENVX_KERNEL \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/vx/pad2_vx.c b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c new file mode 100644 index 0000000..d67751b --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c @@ -0,0 +1,113 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" + +#define REGISTER_PAD2_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_PAD2_OPENVX_KERNEL( pad2 ) +{ + vx_node node = NULL; + vx_nn_pad_params_t param; + size_t dim_num = 0; + int32_t* front_size = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "front_size", &dim_num); + int32_t* back_size = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "back_size", &dim_num); + int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "pad_mode"); + int32_t pad_front_array[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t pad_back_array[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_nn_tensor_t *convert_tensor = NULL; + float const_val = vsi_nn_kernel_param_get_float32(params, "const_val"); + + memset(¶m, 0, sizeof(param)); + memset(pad_front_array, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + memset(pad_back_array, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + + memcpy(pad_front_array, front_size, sizeof(int32_t) * dim_num); + memcpy(pad_back_array, back_size, sizeof(int32_t) * dim_num); + + param.pad_mode = pad_mode; + param.pad_const = vxCreateScalar( graph->ctx->c, VX_TYPE_FLOAT32, &const_val ); + param.numViewDimensions = (uint8_t)vsi_nn_max(dim_num, 2); + param.pad_front_array = pad_front_array; + param.pad_back_array = pad_back_array; + + if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) + { + vsi_nn_tensor_attr_t attr; + memcpy( &attr, &outputs[0]->attr, sizeof( attr ) ); + memcpy( &attr.size, &inputs[0]->attr.size, sizeof( attr.size ) ); + attr.vtl = FALSE; + attr.is_const = FALSE; + + convert_tensor = vsi_nn_CreateTensor(graph, &attr); + + node = vxTensorCopyNode( + graph->g, + inputs[0]->t, + convert_tensor->t + ); + } + else + { + convert_tensor = vsi_nn_reshape_tensor( graph, + inputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num ); + } + + node = vxTensorPadNode( graph->g, convert_tensor->t, outputs[0]->t, ¶m, sizeof(param) ); + + vxReleaseScalar( ¶m.pad_const ); + vsi_safe_release_tensor(convert_tensor); + + return (vsi_nn_kernel_node_t)node; +} /* pad2() */ + +#undef REGISTER_PAD2_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/clip_BF16.cl b/src/tim/vx/internal/src/libnnext/ops/cl/clip_BF16.cl new file mode 100644 index 0000000..dddb09b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/clip_BF16.cl @@ -0,0 +1,37 @@ +#pragma OPENCL EXTENSION CL_VIV_asm : enable + +__kernel void clip_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float minData, + float maxData) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + uint4 src0 = read_imageui(input, coord); + src0 
= src0 << 16; + float4 src; + _viv_asm(COPY, src, src0, 16); + float4 dst0 = clamp(src, minData, maxData); + uint4 dst; + _viv_asm(COPY, dst, dst0, 16); + dst = dst >> 16; + write_imageui(output, coord, dst); +} + +__kernel void clip_BF16toBF16_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float minData, + float maxData) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + uint4 src0 = read_imageui(input, coord); + src0 = src0 << 16; + float4 src; + _viv_asm(COPY, src, src0, 16); + float4 dst0 = clamp(src, minData, maxData); + uint4 dst; + _viv_asm(COPY, dst, dst0, 16); + dst = dst >> 16; + write_imageui(output, coord, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/depth2space_crd.cl b/src/tim/vx/internal/src/libnnext/ops/cl/depth2space_crd.cl new file mode 100644 index 0000000..12f6977 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/depth2space_crd.cl @@ -0,0 +1,17 @@ + +__kernel void depth2space_crd_F32toF32( + image2d_array_t input, image2d_array_t output, int block_size) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord_out = (int4)(gidx, gidy, gidz, 0); + int block_e2 = block_size * block_size; + ushort blk = (ushort)block_size; + int inx = (int)((ushort)gidx / blk); + int iny = (int)((ushort)gidy / blk); + int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; + int4 coord_in = (int4)(inx, iny, inz, 0); + float4 data = read_imagef(input, coord_in); + write_imagef(output, coord_out, data); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl index 5b90eb1..55b63cb 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl @@ -3,6 +3,11 @@ float eltwise_unary_sin(float x, float alpha, float beta) return native_sin(x); } +float eltwise_unary_cos(float x, float alpha, float beta) +{ + return native_cos(x); +} + #define logE (1.44269502f) #define twoLogE (logE * 2.0f) float eltwise_unary_exp(float x, float alpha, float beta) @@ -135,6 +140,7 @@ __kernel void func_name##_F32toF32 \ write_imagef(output, coord, dst.xxxx); \ } ELTWISE_UNARY_F32(sin) +ELTWISE_UNARY_F32(cos) ELTWISE_UNARY_F32(exp) ELTWISE_UNARY_F32(log) ELTWISE_UNARY_F32(elu) @@ -168,6 +174,7 @@ __kernel void func_name##_F32toF32_2D \ write_imagef(output, coord, dst.xxxx); \ } ELTWISE_UNARY_F32_2D(sin) +ELTWISE_UNARY_F32_2D(cos) ELTWISE_UNARY_F32_2D(exp) ELTWISE_UNARY_F32_2D(log) ELTWISE_UNARY_F32_2D(elu) @@ -202,6 +209,7 @@ __kernel void func_name##_U8toU8 \ write_imageui(output, coord, dst); \ } ELTWISE_UNARY_U8(sin) +ELTWISE_UNARY_U8(cos) ELTWISE_UNARY_U8(exp) ELTWISE_UNARY_U8(log) ELTWISE_UNARY_U8(elu) @@ -236,6 +244,7 @@ __kernel void func_name##_U8toU8_2D \ write_imageui(output, coord, dst); \ } ELTWISE_UNARY_U8_2D(sin) +ELTWISE_UNARY_U8_2D(cos) ELTWISE_UNARY_U8_2D(exp) ELTWISE_UNARY_U8_2D(log) ELTWISE_UNARY_U8_2D(elu) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl b/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl index 64f6775..1bacbc0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl @@ -1,7 +1,15 @@ -__kernel void floordiv_F32F32toF32( +__kernel void floordiv_F32F32toF32 + ( __read_only image2d_array_t input, __read_only image2d_array_t input1, - __write_only image2d_array_t output) + __write_only image2d_array_t output, + float 
input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); float4 src0; @@ -12,10 +20,18 @@ __kernel void floordiv_F32F32toF32( write_imagef(output, coord, dst); } -__kernel void floordiv_F32F32toF32_2D( - __read_only image2d_t input, - __read_only image2d_t input1, - __write_only image2d_t output) +__kernel void floordiv_F32F32toF32_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); float4 src0 = read_imagef(input, coord); @@ -24,33 +40,8 @@ __kernel void floordiv_F32F32toF32_2D( write_imagef(output, coord, dst); } -__kernel void floordiv_I32I32toI32( - __read_only image2d_array_t input, - __read_only image2d_array_t input1, - __write_only image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - int4 src0; - int4 src1; - READ_IMAGEI_2DARRAY(src0, input, coord); - READ_IMAGEI_2DARRAY(src1, input1, coord); - int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1))); - write_imagei(output, coord, dst); -} - -__kernel void floordiv_I32I32toI32_2D( - __read_only image2d_t input, - __read_only image2d_t input1, - __write_only image2d_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - int4 src0 = read_imagei(input, coord); - int4 src1 = read_imagei(input1, coord); - int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1))); - write_imagei(output, coord, dst); -} - -__kernel void floordiv_I32I32toU8( +__kernel void floordiv_I32I32toI32 + ( __read_only image2d_array_t input, __read_only image2d_array_t input1, __write_only image2d_array_t output, @@ -59,7 +50,56 @@ __kernel void floordiv_I32I32toU8( float input1Scale, float input1Tail, float outputScale, - float outputTail ) + float outputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 src0; + int4 src1; + READ_IMAGEI_2DARRAY(src0, input, coord); + READ_IMAGEI_2DARRAY(src1, input1, coord); + float4 in0 = convert_float4(src0) * input0Scale + input0Tail; + float4 in1 = convert_float4(src1) * input1Scale + input1Tail; + float4 out = floor(in0 / in1) * outputScale + outputTail; + int4 dst = convert_int4(out); + write_imagei(output, coord, dst); +} + +__kernel void floordiv_I32I32toI32_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 src0 = read_imagei(input, coord); + int4 src1 = read_imagei(input1, coord); + float4 in0 = convert_float4(src0) * input0Scale + input0Tail; + float4 in1 = convert_float4(src1) * input1Scale + input1Tail; + float4 out = floor(in0 / in1) * outputScale + outputTail; + int4 dst = convert_int4(out); + write_imagei(output, coord, dst); +} + +__kernel void floordiv_I32I32toU8 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) { int4 coord = (int4)(get_global_id(0), 
get_global_id(1), get_global_id(2), 0); int4 src0; @@ -73,16 +113,18 @@ __kernel void floordiv_I32I32toU8( write_imageui(output, coord, dst); } -__kernel void floordiv_I32I32toU8_2D( - __read_only image2d_t input, - __read_only image2d_t input1, - __write_only image2d_t output, - float input0Scale, - float input0Tail, - float input1Scale, - float input1Tail, - float outputScale, - float outputTail ) +__kernel void floordiv_I32I32toU8_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); int4 src0 = read_imagei(input, coord); @@ -94,7 +136,8 @@ __kernel void floordiv_I32I32toU8_2D( write_imageui(output, coord, dst); } -__kernel void floordiv_U8U8toU8( +__kernel void floordiv_U8U8toU8 + ( __read_only image2d_array_t input, __read_only image2d_array_t input1, __write_only image2d_array_t output, @@ -103,7 +146,8 @@ __kernel void floordiv_U8U8toU8( float input1Scale, float input1Tail, float outputScale, - float outputTail ) + float outputTail + ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); uint4 src0, src1; @@ -117,16 +161,18 @@ __kernel void floordiv_U8U8toU8( write_imageui(output, coord, dst); } -__kernel void floordiv_U8U8toU8_2D( - __read_only image2d_t input, - __read_only image2d_t input1, - __write_only image2d_t output, - float input0Scale, - float input0Tail, - float input1Scale, - float input1Tail, - float outputScale, - float outputTail ) +__kernel void floordiv_U8U8toU8_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); uint4 src0 = read_imageui(input, coord); @@ -139,7 +185,8 @@ __kernel void floordiv_U8U8toU8_2D( write_imageui(output, coord, dst); } -__kernel void floordiv_U8I32toU8( +__kernel void floordiv_U8I32toU8 + ( __read_only image2d_array_t input, __read_only image2d_array_t input1, __write_only image2d_array_t output, @@ -148,7 +195,8 @@ __kernel void floordiv_U8I32toU8( float input1Scale, float input1Tail, float outputScale, - float outputTail ) + float outputTail + ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); uint4 src0; @@ -163,16 +211,18 @@ __kernel void floordiv_U8I32toU8( write_imageui(output, coord, dst); } -__kernel void floordiv_U8I32toU8_2D( - __read_only image2d_t input, - __read_only image2d_t input1, - __write_only image2d_t output, - float input0Scale, - float input0Tail, - float input1Scale, - float input1Tail, - float outputScale, - float outputTail ) +__kernel void floordiv_U8I32toU8_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); uint4 src0 = read_imageui(input, coord); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl index 1c8caff..49d04e2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl @@ -5,7 +5,8 @@ __kernel void gather_U8toU8( int block_size, 
int block_num, int axis_num, - int indices_num + int indices_num, + int batch ) { int gidx = get_global_id(0); // block_size @@ -29,7 +30,8 @@ __kernel void gather_F16toF16( int block_size, int block_num, int axis_num, - int indices_num + int indices_num, + int batch ) { int gidx = get_global_id(0); // block_size @@ -53,7 +55,8 @@ __kernel void gather_I32toI32( int block_size, int block_num, int axis_num, - int indices_num + int indices_num, + int batch ) { int gidx = get_global_id(0); // block_size @@ -77,7 +80,8 @@ __kernel void gather_F32toF32( int block_size, int block_num, int axis_num, - int indices_num + int indices_num, + int batch ) { int gidx = get_global_id(0); // block_size diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl new file mode 100644 index 0000000..4ff6ec1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl @@ -0,0 +1,123 @@ +__kernel void gather_batch_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num, + int indices_num, + int batch + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + for(; coord_idx.y < batch;) + { + int4 indice = read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + uint4 data = read_imageui(input0, coord_in); + coord_in.z++; + write_imageui(output, coord, data); + coord.z++; + } +} + +__kernel void gather_batch_F16toF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num, + int indices_num, + int batch + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + for(; coord_idx.y < batch;) + { + int4 indice = read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + float4 data = read_imagef(input0, coord_in); + coord_in.z++; + write_imagef(output, coord, data); + coord.z++; + } +} + +__kernel void gather_batch_I32toI32( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num, + int indices_num, + int batch + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + for(; coord_idx.y < batch;) + { + int4 indice = read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + int4 data = read_imagei(input0, coord_in); + coord_in.z++; + write_imagei(output, coord, data); + coord.z++; + } +} + +__kernel void gather_batch_F32toF32( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num, + int indices_num, + int batch + ) +{ + int gidx = 
get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + for(; coord_idx.y < batch;) + { + int4 indice = read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + float4 data = read_imagef(input0, coord_in); + coord_in.z++; + write_imagef(output, coord, data); + coord.z++; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl index 8b4dd55..effa919 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl @@ -112,6 +112,48 @@ __kernel void moments_axis0_I32toF32( vari.x = sqr * dimRatio * input_scale * input_scale; vari.x = vari.x - mean.x * mean.x; + int2 coord_out = (int2)(gidy, gidz); + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); +} + +__kernel void moments_axis0_BF16toF32( + __read_only image2d_array_t input, + __write_only image2d_t output_mean, + __write_only image2d_t output_vari, + int axis, + int axis_num, + int input_zp, + float input_scale, + int width, + int height, + int chn, + float dimRatio + ) +{ + int gidy = get_global_id(0); + int gidz = get_global_id(1); + + int4 coord0 = (int4)(0, gidy, gidz, 0); + float4 data; + float sum = 0, sqr = 0; + + for(coord0.x = 0; coord0.x < width;) + { + uint4 src0 = read_imageui(input, coord0); + src0 = src0 << 16; + _viv_asm(COPY, data, src0, 16); + coord0.x++; + + sum = sum + data.x; + sqr = sqr + data.x * data.x; + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + int2 coord_out = (int2)(gidy, gidz); write_imagef(output_mean, coord_out, mean); write_imagef(output_vari, coord_out, vari); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl index a89ec8a..05f9e3a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl @@ -172,3 +172,63 @@ __kernel void moments_axis01_I32toF32( write_imagef(output_vari, coord_out, vari); } } + +__kernel void moments_axis01_BF16toF32( + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, + int axis, int axis_num, int input_zp, float input_scale, + int width, int height, int chn, float dimRatio + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int4 coord = (int4)(gidx, 0, gidz, 0); + float4 data; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + for(coord.x = gidx; coord.x < width; coord.x += 16) + { + float tmpSum = 0, tmpSqr = 0; + for(coord.y = 0; coord.y < height;) + { + uint4 src0 = read_imageui(input, coord); + src0 = src0 << 16; + _viv_asm(COPY, data, src0, 16); + coord.y++; + + tmpSum = tmpSum + data.x; + tmpSqr = tmpSqr + data.x * data.x; + } + sqr += tmpSqr; + sum += tmpSum; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(gidz, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += 
dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl index fa0ce44..44e9809 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl @@ -177,3 +177,64 @@ __kernel void moments_axis012_I32toF32( write_imagef(output_vari, coord_out, vari); } } + +__kernel void moments_axis012_BF16toF32( + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, + int axis, int axis_num, int input_zp, float input_scale, + int width, int height, int chn, float dimRatio + ) +{ + int gidx = get_global_id(0); + int lidx = get_local_id(0); + + int4 coord = (int4)(gidx, 0, 0, 0); + float4 data; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + for(coord.z = 0; coord.z < chn; coord.z++) + { + for(coord.x = gidx; coord.x < width; coord.x += 16) + { + float tmpSum = 0, tmpSqr = 0; + for(coord.y = 0; coord.y < height;) + { + uint4 src0 = read_imageui(input, coord); + src0 = src0 << 16; + _viv_asm(COPY, data, src0, 16); + coord.y++; + tmpSum = tmpSum + data.x; + tmpSqr = tmpSqr + data.x * data.x; + } + sqr += tmpSqr; + sum += tmpSum; + } + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl index a18bdc2..191e321 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl @@ -106,6 +106,47 @@ __kernel void moments_axis1_I32toF32( vari.x = sqr * dimRatio * input_scale * input_scale; vari.x = vari.x - mean.x * mean.x; + int2 coord_out = (int2)(gidx, gidz); + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); +} + +__kernel void moments_axis1_BF16toF32( + __read_only image2d_array_t input, + __write_only image2d_t output_mean, + __write_only image2d_t output_vari, + int axis, + int axis_num, + int input_zp, + float input_scale, + int width, + int height, + int chn, + float dimRatio + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + + int4 coord0 = (int4)(gidx, 0, gidz, 0); + float4 data; + float sum = 0, sqr = 0; + + for(coord0.y = 0; coord0.y < height;) + { + uint4 src0 = read_imageui(input, coord0); + src0 = src0 << 16; + _viv_asm(COPY, data, src0, 16); + coord0.y++; + sum = sum + data.x; + sqr = sqr + data.x * data.x; + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + int2 coord_out = (int2)(gidx, gidz); write_imagef(output_mean, coord_out, mean); write_imagef(output_vari, 
coord_out, vari); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl index 078cf74..8cf72cb 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl @@ -123,4 +123,46 @@ __kernel void moments_axis2_I32toF32( int2 coord_out = (int2)(gidx, gidy); write_imagef(output_mean, coord_out, mean); write_imagef(output_vari, coord_out, vari); -} \ No newline at end of file +} + +__kernel void moments_axis2_BF16toF32( + __read_only image2d_array_t input, + __write_only image2d_t output_mean, + __write_only image2d_t output_vari, + int axis, + int axis_num, + int input_zp, + float input_scale, + int width, + int height, + int chn, + float dimRatio + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + int4 coord0 = (int4)(gidx, gidy, 0, 0); + float4 data; + float sum = 0, sqr = 0; + + for(coord0.z = 0; coord0.z < chn;) + { + uint4 src0 = read_imageui(input, coord0); + src0 = src0 << 16; + _viv_asm(COPY, data, src0, 16); + coord0.z++; + + sum = sum + data.x; + sqr = sqr + data.x * data.x; + } + + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + + int2 coord_out = (int2)(gidx, gidy); + write_imagef(output_mean, coord_out, mean); + write_imagef(output_vari, coord_out, vari); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl new file mode 100644 index 0000000..2596e66 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl @@ -0,0 +1,251 @@ +#define TOPK_F32(LOCAL_SIZE0, STAGES) \ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toF32_I32 \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + __write_only image2d_t indices, \ + int num_stages, \ + int width \ + ) \ + { \ + uint local_id = get_local_id(0); \ + uint work_group_size = get_local_size(0); \ + uint offset = 0; \ + \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + __local float local_data[128]; \ + __local uint local_indices[128]; \ + \ + float left = read_imagef(input, coord.xy).x; \ + coord.z += work_group_size; \ + float data = read_imagef(input, coord.zy).x; \ + float right = coord.z < width ? 
data : -2147483647.0f; \ + \ + local_data[local_id] = left; \ + local_indices[local_id] = local_id; \ + local_data[local_id + work_group_size] = right; \ + local_indices[local_id + work_group_size] = local_id + work_group_size; \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + for (uint stage = 0; stage < num_stages + 1; ++stage) \ + { \ + uint signo = (local_id >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + uint left_idx = local_indices[left_id]; \ + uint right_idx = local_indices[right_id]; \ + \ + float left_elem = local_data[left_id]; \ + float right_elem = local_data[right_id]; \ + \ + if ((left_elem < right_elem) ^ signo) \ + { \ + local_data[left_id] = right_elem; \ + local_data[right_id] = left_elem; \ + \ + local_indices[left_id] = right_idx; \ + local_indices[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ + } \ + \ + float4 dst; \ + dst.x = local_data[local_id]; \ + dst.y = local_data[local_id + work_group_size]; \ + \ + write_imagef(output, coord.xy, dst.xxxx); \ + write_imagef(output, coord.zy, dst.yyyy); \ + \ + int4 index; \ + index.x = ((int*)local_indices)[local_id]; \ + index.y = ((int*)local_indices)[local_id + work_group_size]; \ + \ + write_imagei(indices, coord.xy, index.xxxx); \ + write_imagei(indices, coord.zy, index.yyyy); \ + } +TOPK_F32(1 << 0, 0) +TOPK_F32(1 << 1, 1) +TOPK_F32(1 << 2, 2) +TOPK_F32(1 << 3, 3) +TOPK_F32(1 << 4, 4) +TOPK_F32(1 << 5, 5) +TOPK_F32(1 << 6, 6) + +#define TOPK_U32(LOCAL_SIZE0, STAGES) \ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_U32toU32_I32 \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + __write_only image2d_t indices, \ + int num_stages, \ + int width \ + ) \ + { \ + uint local_id = get_local_id(0); \ + uint work_group_size = get_local_size(0); \ + uint offset = 0; \ + \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + __local uint local_data[128]; \ + __local uint local_indices[128]; \ + \ + uint left = read_imageui(input, coord.xy).x; \ + coord.z += work_group_size; \ + uint data = read_imageui(input, coord.zy).x; \ + uint right = coord.z < width ? 
data : 0; \ + \ + local_data[local_id] = left; \ + local_indices[local_id] = local_id; \ + local_data[local_id + work_group_size] = right; \ + local_indices[local_id + work_group_size] = local_id + work_group_size; \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + for (uint stage = 0; stage < num_stages + 1; ++stage) \ + { \ + uint signo = (local_id >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + uint left_idx = local_indices[left_id]; \ + uint right_idx = local_indices[right_id]; \ + \ + uint left_elem = local_data[left_id]; \ + uint right_elem = local_data[right_id]; \ + \ + if ((left_elem < right_elem) ^ signo) \ + { \ + local_data[left_id] = right_elem; \ + local_data[right_id] = left_elem; \ + \ + local_indices[left_id] = right_idx; \ + local_indices[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ + } \ + \ + uint4 dst; \ + dst.x = local_data[local_id]; \ + dst.y = local_data[local_id + work_group_size]; \ + \ + write_imageui(output, coord.xy, dst.xxxx); \ + write_imageui(output, coord.zy, dst.yyyy); \ + \ + int4 index; \ + index.x = ((int*)local_indices)[local_id]; \ + index.y = ((int*)local_indices)[local_id + work_group_size]; \ + \ + write_imagei(indices, coord.xy, index.xxxx); \ + write_imagei(indices, coord.zy, index.yyyy); \ + } +TOPK_U32(1 << 0, 0) +TOPK_U32(1 << 1, 1) +TOPK_U32(1 << 2, 2) +TOPK_U32(1 << 3, 3) +TOPK_U32(1 << 4, 4) +TOPK_U32(1 << 5, 5) +TOPK_U32(1 << 6, 6) + +#define TOPK_I32(LOCAL_SIZE0, STAGES) \ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_I32toI32_I32 \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + __write_only image2d_t indices, \ + int num_stages, \ + int width \ + ) \ + { \ + int local_id = get_local_id(0); \ + int work_group_size = get_local_size(0); \ + int offset = 0; \ + \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + __local int local_data[128]; \ + __local int local_indices[128]; \ + \ + int left = read_imagei(input, coord.xy).x; \ + coord.z += work_group_size; \ + int data = read_imagei(input, coord.zy).x; \ + int right = coord.z < width ? 
data : -2147483647; \ + \ + local_data[local_id] = left; \ + local_indices[local_id] = local_id; \ + local_data[local_id + work_group_size] = right; \ + local_indices[local_id + work_group_size] = local_id + work_group_size; \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + for (int stage = 0; stage < num_stages + 1; ++stage) \ + { \ + int signo = (local_id >> stage) & 1; \ + \ + for (int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + int postShift = (stage - passOfStage); \ + int pairDistance = 1 << postShift; \ + \ + int left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \ + int right_id = left_id + pairDistance; \ + \ + int left_idx = local_indices[left_id]; \ + int right_idx = local_indices[right_id]; \ + \ + int left_elem = local_data[left_id]; \ + int right_elem = local_data[right_id]; \ + \ + if ((left_elem < right_elem) ^ signo) \ + { \ + local_data[left_id] = right_elem; \ + local_data[right_id] = left_elem; \ + \ + local_indices[left_id] = right_idx; \ + local_indices[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ + } \ + \ + int4 dst; \ + dst.x = local_data[local_id]; \ + dst.y = local_data[local_id + work_group_size]; \ + \ + write_imagei(output, coord.xy, dst.xxxx); \ + write_imagei(output, coord.zy, dst.yyyy); \ + \ + int4 index; \ + index.x = ((int*)local_indices)[local_id]; \ + index.y = ((int*)local_indices)[local_id + work_group_size]; \ + \ + write_imagei(indices, coord.xy, index.xxxx); \ + write_imagei(indices, coord.zy, index.yyyy); \ + } +TOPK_I32(1 << 0, 0) +TOPK_I32(1 << 1, 1) +TOPK_I32(1 << 2, 2) +TOPK_I32(1 << 3, 3) +TOPK_I32(1 << 4, 4) +TOPK_I32(1 << 5, 5) +TOPK_I32(1 << 6, 6) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis2.vx index ac867c9..540a834 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/argmax_axis2.vx @@ -3,6 +3,8 @@ _viv_uniform int4 packedArgIdx; _viv_uniform int argLenSub1; _viv_uniform VXC_512Bits uniExtractData_2x8; +_viv_uniform VXC_512Bits uniExtract1stU8toI16_2x8; +_viv_uniform VXC_512Bits uniExtract2ndU8toI16_2x8; #define TENSOR_ARGMAX_AXIS2_16BITS(src_type_name, dst_type_name,\ src_type, copy_type, axis_type, dst_type, inst_type) \ @@ -67,6 +69,56 @@ TENSOR_ARGMAX_AXIS2_16BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_ #define TENSOR_ARGMAX_AXIS2_8BITS(src_type_name, dst_type_name, src_type, dst_type) \ __kernel void argmax_axis2_##src_type_name##to##dst_type_name( \ __read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \ + src_type src; \ + src_type maxVal; \ + VXC_ReadImage2DArray(maxVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + dst_type axis; \ + dst_type packIdx; \ + \ + _viv_asm(COPY, axis, packedArgIdx, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + \ + coord.z --; \ + do \ + { \ + VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.z --; \ + packIdx --; \ + maxVal = max(maxVal, src); \ + src_type condition; \ + VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + axis = condition ? 
packIdx : axis; \ + } while (coord.z >= 0); \ + \ + VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMAX_AXIS2_8BITS(I8, U8, vxc_char16, vxc_uchar16) +TENSOR_ARGMAX_AXIS2_8BITS(U8, U8, vxc_uchar16, vxc_uchar16) + +#define TENSOR_ARGMAX_AXIS2_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name##_2D( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output, \ + int axisVal \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_WriteImage(output, coord, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, I16, vxc_char8, vxc_short8) +TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, U8, vxc_char8, vxc_uchar8) +TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, I16, vxc_uchar8, vxc_short8) +TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, U8, vxc_uchar8, vxc_uchar8) + +#define TENSOR_ARGMAX_AXIS2_MIX(src_type_name, dst_type_name, src_type, dst_type) \ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name( \ +__read_only image2d_array_t input, \ __write_only image2d_array_t output, \ int axisVal \ ) \ @@ -95,23 +147,46 @@ __write_only image2d_array_t output, \ \ VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } -TENSOR_ARGMAX_AXIS2_8BITS(I8, I16, vxc_char8, vxc_short8) -TENSOR_ARGMAX_AXIS2_8BITS(I8, U8, vxc_char8, vxc_uchar8) -TENSOR_ARGMAX_AXIS2_8BITS(U8, I16, vxc_uchar8, vxc_short8) -TENSOR_ARGMAX_AXIS2_8BITS(U8, U8, vxc_uchar8, vxc_uchar8) +TENSOR_ARGMAX_AXIS2_MIX(I8, I16, vxc_char8, vxc_short8) +TENSOR_ARGMAX_AXIS2_MIX(U8, I16, vxc_uchar8, vxc_short8) -#define TENSOR_ARGMAX_AXIS2_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \ - __kernel void argmax_axis2_##src_type_name##to##dst_type_name##_2D( \ +#define TENSOR_ARGMAX_AXIS2_MIX_OPT(src_type_name, dst_type_name, src_type, dst_type) \ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name##_opt( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ int axisVal \ ) \ { \ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ - dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \ - VXC_WriteImage(output, coord, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \ + src_type src; \ + src_type maxVal; \ + VXC_ReadImage2DArray(maxVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + vxc_uchar16 axis; \ + vxc_uchar16 packIdx; \ + \ + _viv_asm(COPY, axis, packedArgIdx, 16); \ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \ + \ + coord.z --; \ + do \ + { \ + VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.z --; \ + packIdx --; \ + maxVal = max(maxVal, src); \ + src_type condition; \ + VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \ + axis = condition ? 
packIdx : axis; \ + } while (coord.z >= 0); \ + vxc_short8 dst0, dst1; \ + VXC_DP2x8(dst0, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniExtract1stU8toI16_2x8); \ + VXC_DP2x8(dst1, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniExtract2ndU8toI16_2x8); \ + \ + VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + VXC_WriteImage(output, coord.xy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } -TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, I16, vxc_char8, vxc_short8) -TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, U8, vxc_char8, vxc_uchar8) -TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, I16, vxc_uchar8, vxc_short8) -TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, U8, vxc_uchar8, vxc_uchar8) +TENSOR_ARGMAX_AXIS2_MIX_OPT(I8, I16, vxc_char16, vxc_short8) +TENSOR_ARGMAX_AXIS2_MIX_OPT(U8, I16, vxc_uchar16, vxc_short8) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_softmax.vx similarity index 83% rename from src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax.vx rename to src/tim/vx/internal/src/libnnext/ops/vx/custom_softmax.vx index 305f666..e3ca29e 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/custom_softmax.vx @@ -19,14 +19,13 @@ __kernel void Softmax2VXC int axis ) { - int4 coord_in = (int4)(0,0,0,0); float fMax = 0.0; for (int i = 0; i < sf_size; i++) { vxc_char8 val; coord_in.x = i; - VXC_ReadImage2DArray(val, input, coord_in, VXC_5BITOFFSET_XY(0,0), VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); float fval; VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); @@ -40,7 +39,7 @@ __kernel void Softmax2VXC vxc_char8 val; coord_in.x = i; - VXC_ReadImage2DArray(val, input, coord_in, VXC_5BITOFFSET_XY(0,0), VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); float fval; VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); @@ -57,7 +56,7 @@ __kernel void Softmax2VXC vxc_short8 val; vxc_half8 val_h; coord_in.x = i; - VXC_ReadImage2DArray(val, output, coord_in, VXC_5BITOFFSET_XY(0,0), VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); float fval; _viv_asm(COPY, val_h,val, 16); VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); @@ -68,8 +67,4 @@ __kernel void Softmax2VXC _viv_asm(COPY,dst,hVal, 4); VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } - } - - - diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine.vx new file mode 100644 index 0000000..cd9511b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine.vx @@ -0,0 +1,353 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#include "cl_viv_vx_ext.h" + +_viv_uniform float4 matrix0; +_viv_uniform float2 matrix1; +_viv_uniform float4 matrix4; +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_2D +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int2 coord = (int2)(get_global_id(0), 
get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in = convert_int4(coord_f); + + vxc_uchar16 dst; + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_affine_bilinear_U8toU8_2D +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in = convert_int4(coord_f); + + vxc_uchar16 src0, src1, dst; + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + 
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_affine_nearest_neighbor_U8toU8 +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * 
matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in = convert_int4(coord_f); + + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_input.w, baseAddr); + + vxc_uchar16 dst; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_affine_bilinear_U8toU8 +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in = convert_int4(coord_f); + + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_input.w, baseAddr); + + vxc_uchar16 src0, src1, dst; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, 
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + 
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_perspective.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_perspective.vx new file mode 100644 index 0000000..3b3b3f1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_perspective.vx @@ -0,0 +1,395 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#include "cl_viv_vx_ext.h" + +_viv_uniform float4 matrix0; +_viv_uniform float4 matrix1; +_viv_uniform float4 matrix2; +_viv_uniform float4 matrix4; +__kernel void custom_warp_perspective_nearest_neighbor_U8toU8_2D +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5, + float _m6, + float _m7, + float _m8 +) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f0 = convert_float4(coord_in); + + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx; + z0.zw = z0.zw + 2 * matrix1.z; + float4 z1 = z0 + 4 * matrix1.z; + + z0 = 1.0f / z0; + z1 = 1.0f / z1; + + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy; + float4 coord_f = coord_f0 * z0.xxyy; + + coord_in = convert_int4(coord_f); + + vxc_uchar16 dst; + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z0.zzww; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z1.xxyy; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z1.zzww; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_perspective_bilinear_U8toU8_2D +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5, + float _m6, + float _m7, + float _m8 +) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) 
+ 1, get_global_id(1)); + + float4 coord_f0 = convert_float4(coord_in); + + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx; + z0.zw = z0.zw + 2 * matrix1.z; + float4 z1 = z0 + 4 * matrix1.z; + + z0 = 1.0f / z0; + z1 = 1.0f / z1; + + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy; + float4 coord_f = coord_f0 * z0.xxyy; + + coord_in = convert_int4(floor(coord_f)); + + vxc_uchar16 src0, src1, dst; + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z0.zzww; + coord_in = convert_int4(floor(coord_f)); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z1.xxyy; + coord_in = convert_int4(floor(coord_f)); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + 
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z1.zzww; + coord_in = convert_int4(floor(coord_f)); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +#define IMAGE_LOAD_3D(dst, xoffset, yoffset, start, end) \ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, VXC_5BITOFFSET_XY(xoffset, yoffset), \ + VXC_MODIFIER(start, end, 0, VXC_RM_TowardZero, 0)); +__kernel void custom_warp_perspective_nearest_neighbor_U8toU8 +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5, + float _m6, + float _m7, + float _m8 +) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f0 = convert_float4(coord_in); + + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx; + z0.zw = z0.zw + 2 * matrix1.z; + float4 z1 = z0 + 4 * matrix1.z; + + z0 = 1.0f / z0; + z1 = 1.0f / z1; + + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy; + float4 coord_f = coord_f0 * z0.xxyy; + + coord_in = convert_int4(coord_f); + + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_input.w, baseAddr); + + vxc_uchar16 dst; + IMAGE_LOAD_3D(dst, 0, 0, 0, 0) + coord_input.xy = coord_in.zw; + IMAGE_LOAD_3D(dst, 0, 0, 1, 1) + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z0.zzww; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + IMAGE_LOAD_3D(dst, 0, 0, 2, 2) + coord_input.xy = coord_in.zw; + IMAGE_LOAD_3D(dst, 0, 0, 3, 3) + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z1.xxyy; + coord_in = convert_int4(coord_f); + coord_input.xy = 
coord_in.xy; + IMAGE_LOAD_3D(dst, 0, 0, 4, 4) + coord_input.xy = coord_in.zw; + IMAGE_LOAD_3D(dst, 0, 0, 5, 5) + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z1.zzww; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + IMAGE_LOAD_3D(dst, 0, 0, 6, 6) + coord_input.xy = coord_in.zw; + IMAGE_LOAD_3D(dst, 0, 0, 7, 7) + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_perspective_bilinear_U8toU8 +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5, + float _m6, + float _m7, + float _m8 +) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f0 = convert_float4(coord_in); + + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx; + z0.zw = z0.zw + 2 * matrix1.z; + float4 z1 = z0 + 4 * matrix1.z; + + z0 = 1.0f / z0; + z1 = 1.0f / z1; + + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy; + float4 coord_f = coord_f0 * z0.xxyy; + + coord_in = convert_int4(floor(coord_f)); + + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_input.w, baseAddr); + + vxc_uchar16 src0, src1, dst; + IMAGE_LOAD_3D(src0, 0, 0, 0, 1) + IMAGE_LOAD_3D(src1, 0, 1, 0, 1) +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + IMAGE_LOAD_3D(src0, 0, 0, 0, 1) + IMAGE_LOAD_3D(src1, 0, 1, 0, 1) +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z0.zzww; + coord_in = convert_int4(floor(coord_f)); + coord_input.xy = coord_in.xy; + IMAGE_LOAD_3D(src0, 0, 0, 0, 1) + IMAGE_LOAD_3D(src1, 0, 1, 0, 1) +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + IMAGE_LOAD_3D(src0, 0, 0, 0, 1) + IMAGE_LOAD_3D(src1, 0, 1, 0, 1) +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z1.xxyy; + coord_in = convert_int4(floor(coord_f)); + coord_input.xy = coord_in.xy; + IMAGE_LOAD_3D(src0, 0, 0, 0, 1) 
+ IMAGE_LOAD_3D(src1, 0, 1, 0, 1) +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + IMAGE_LOAD_3D(src0, 0, 0, 0, 1) + IMAGE_LOAD_3D(src1, 0, 1, 0, 1) +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f0 = coord_f0.zwzw + matrix4; + coord_f = coord_f0 * z1.zzww; + coord_in = convert_int4(floor(coord_f)); + coord_input.xy = coord_in.xy; + IMAGE_LOAD_3D(src0, 0, 0, 0, 1) + IMAGE_LOAD_3D(src1, 0, 1, 0, 1) +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + IMAGE_LOAD_3D(src0, 0, 0, 0, 1) + IMAGE_LOAD_3D(src1, 0, 1, 0, 1) +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx b/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx index a5612b4..601ebfd 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx @@ -304,4 +304,4 @@ __kernel void depth2space_crd_F16toI16_blk2( VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_out.x += 8; VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx index 8a56bb3..086e399 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx @@ -8,6 +8,11 @@ float4 eltwise_unary_sin(float4 x) return native_sin(x); } +float4 eltwise_unary_cos(float4 x) +{ + return native_cos(x); +} + #define logE (1.44269502f) #define twoLogE (logE * 2.0f) float4 eltwise_unary_exp(float4 x) @@ -189,6 +194,17 @@ ELTSISE_UNARY_2D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_u ELTSISE_UNARY_2D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) ELTSISE_UNARY_2D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) ELTSISE_UNARY_2D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//COS +ELTSISE_UNARY_2D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, 
vxc_uchar8) +ELTSISE_UNARY_2D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) //LOG ELTSISE_UNARY_2D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) ELTSISE_UNARY_2D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) @@ -315,6 +331,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8; ELTSISE_UNARY_BF16_2D(exp) //SIN ELTSISE_UNARY_BF16_2D(sin) +//COS +ELTSISE_UNARY_BF16_2D(cos) //LOG ELTSISE_UNARY_BF16_2D(log) //ELU diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx index 3faa1f5..a7ba363 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx @@ -8,6 +8,11 @@ float4 eltwise_unary_sin(float4 x) return native_sin(x); } +float4 eltwise_unary_cos(float4 x) +{ + return native_cos(x); +} + #define logE (1.44269502f) #define twoLogE (logE * 2.0f) float4 eltwise_unary_exp(float4 x) @@ -189,6 +194,17 @@ ELTSISE_UNARY_3D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_u ELTSISE_UNARY_3D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) ELTSISE_UNARY_3D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) ELTSISE_UNARY_3D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//COS +ELTSISE_UNARY_3D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) //LOG ELTSISE_UNARY_3D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) ELTSISE_UNARY_3D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) @@ -314,6 +330,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8; ELTSISE_UNARY_BF16(exp) //SIN ELTSISE_UNARY_BF16(sin) +//COS +ELTSISE_UNARY_BF16(cos) //LOG ELTSISE_UNARY_BF16(log) //ELU diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx index 90b5135..3a1661e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx @@ -91,8 +91,6 @@ __kernel void gather_F16toF16( int gidz = get_global_id(2); // block_num int4 coord_in = (int4)(gidy, 0, gidx, 0); - - int4 indice = read_imagei(input1, coord_in.xy); coord_in.w = gidz * 
axis_num + indice.x; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx new file mode 100644 index 0000000..8d09d50 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx @@ -0,0 +1,237 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int indices_num; +_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8; +_viv_uniform int batch; + +__kernel void gather_batch_I8toI8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + for(; coord_idx.y < batch;) + { + int4 indice = read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + vxc_char16 src; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.z++; + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.z++; + } +} + +__kernel void gather_batch_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + for(; coord_idx.y < batch;) + { + int4 indice = read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + vxc_uchar16 src; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.z++; + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.z++; + } +} + +__kernel void gather_batch_I16toI16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + for(; coord_idx.y < batch;) + { + int4 indice = read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + vxc_short8 src; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.z++; + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.z++; + } +} + +__kernel void gather_batch_F16toF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + for(; coord_idx.y < batch;) + { + int4 indice 
= read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + vxc_short8 src; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.z++; + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.z++; + } +} + +__kernel void gather_batch_I8toI8_axis0( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 indices = read_imagei(input1, coord.xz); + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); + + vxc_char16 src, dst; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.y; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.z; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.w; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtraCopyDpKeepinEvis_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_batch_U8toU8_axis0( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 indices = read_imagei(input1, coord.xz); + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); + + vxc_uchar16 src, dst; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.y; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.z; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.w; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtraCopyDpKeepinEvis_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_batch_I16toI16_axis0( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 indices = read_imagei(input1, coord.xz); + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.y; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.z; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.w; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtraCopyDpKeepinEvis_2x8); + + 
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_batch_F16toF16_axis0( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 indices = read_imagei(input1, coord.xz); + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.y; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.z; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.w; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtraCopyDpKeepinEvis_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx new file mode 100644 index 0000000..0e94445 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx @@ -0,0 +1,236 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int indices_num; +_viv_uniform int batch; + +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; + +_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8; +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp + +#define GATHER_BATCH_8BITS_TO_F16(src0_type_name, read_type) \ +__kernel void gather_batch_##src0_type_name##toF16( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int block_num, \ + int axis_num \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + \ + int2 coord_idx = (int2)(gidy, 0); \ + int4 coord_in = (int4)(gidx, 0, 0, 0); \ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + \ + for(; coord_idx.y < batch;) \ + { \ + int4 indice = read_imagei(input1, coord_idx); \ + coord_idx.y++; \ + coord_in.y = gidz * axis_num + indice.x; \ + \ + read_type src; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z++; \ + vxc_half8 src0, src1; \ + vxc_short8 dst0, dst1; \ + \ + VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_DP2x8(src1,src,ms0, VXC_MODIFIER(0,7,8, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst0, src0, 16); \ + _viv_asm(COPY, dst1, src1, 16); \ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + VXC_WriteImage2DArray(output, coord, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z++; \ + coord.x = gidx; \ + } \ +} +GATHER_BATCH_8BITS_TO_F16(U8, vxc_uchar16) +GATHER_BATCH_8BITS_TO_F16(I8, vxc_char16) + +#define GATHER_BATCH_F16_TO_QINT(src1_type_name, write_type) \ +__kernel void gather_batch_F16to##src1_type_name( \ + __read_only image2d_t input0, \ + __read_only image2d_t 
input1, \ + __write_only image2d_t output, \ + int block_size, \ + int block_num, \ + int axis_num \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + \ + int2 coord_idx = (int2)(gidy, 0); \ + int4 coord_in = (int4)(gidx, 0, 0, 0); \ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + for(; coord_idx.y < batch;) \ + { \ + int4 indice = read_imagei(input1, coord_idx); \ + coord_idx.y++; \ + coord_in.y = gidz * axis_num + indice.x; \ + \ + vxc_short8 src; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z++; \ + \ + vxc_half8 data; \ + write_type dst; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z++; \ + } \ +} +GATHER_BATCH_F16_TO_QINT(U8, vxc_uchar16) +GATHER_BATCH_F16_TO_QINT(I8, vxc_char16) +GATHER_BATCH_F16_TO_QINT(I16, vxc_short8) + +__kernel void gather_batch_I16toF16( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + + int2 coord_idx = (int2)(gidy, 0); + int4 coord_in = (int4)(gidx, 0, 0, 0); + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); + vxc_half8 src0; + vxc_short8 dst0; + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + for(; coord_idx.y < batch;) + { + int4 indice = read_imagei(input1, coord_idx); + coord_idx.y++; + coord_in.y = gidz * axis_num + indice.x; + + vxc_short8 src; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.z++; + VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniU8MulAndPostShift_0_Lo_2x8); + _viv_asm(COPY, dst0, src0, 16); + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.z++; + } +} + +#define GATHER_BATCH_8BITS_TO_F16_AXIS0(src0_type_name, read_type) \ +__kernel void gather_batch_##src0_type_name##toF16_axis0( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int block_num, \ + int axis_num \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 indices = read_imagei(input1, coord.xz); \ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \ + \ + read_type src; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = indices.y; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = indices.z; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = indices.w; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_half8 src0; \ + vxc_short8 dst0; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst0, src0, 16); \ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} 
+GATHER_BATCH_8BITS_TO_F16_AXIS0(U8, vxc_uchar16) +GATHER_BATCH_8BITS_TO_F16_AXIS0(I8, vxc_char16) + +#define GATHER_BATCH_F16_TO_QINT_AXIS0(src1_type_name, write_type) \ +__kernel void gather_batch_F16to##src1_type_name##_axis0( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int block_num, \ + int axis_num \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 indices = read_imagei(input1, coord.xz); \ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \ + \ + vxc_short8 src; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = indices.y; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = indices.z; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = indices.w; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + vxc_half8 data; \ + write_type dst; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GATHER_BATCH_F16_TO_QINT_AXIS0(U8, vxc_uchar16) +GATHER_BATCH_F16_TO_QINT_AXIS0(I8, vxc_char16) +GATHER_BATCH_F16_TO_QINT_AXIS0(I16, vxc_short8) + +__kernel void gather_batch_I16toF16_axis0( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 indices = read_imagei(input1, coord.xz); + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.y; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.z; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_in.x = indices.w; + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + + vxc_half8 src0; + vxc_short8 dst0; + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniU8MulAndPostShift_0_Lo_2x8); + _viv_asm(COPY, dst0, src0, 16); + + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/logical_ops.vx b/src/tim/vx/internal/src/libnnext/ops/vx/logical_ops.vx index 4ba7c40..dceb404 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/logical_ops.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/logical_ops.vx @@ -1,5 +1,7 @@ #include "cl_viv_vx_ext.h" +_viv_uniform VXC_512Bits uniConvertInt16toInt8_2x8; + #define TENSORLOGICAL_PROCESS(input_type, copy_type, output_type, out_copy_type,\ lgc_op, lgc_op2, read_fun, write_fun) \ input_type vA;\ @@ -59,7 +61,7 @@ out_copy_type, lgc_op, lgc_op2, read_fun, write_fun) \ VXC_DP2x8(tmpOut,dst,dst,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero, 0),uniMulShortMinus1toFp16_2x8); \ out_copy_type data; \ _viv_asm(COPY, data, tmpOut, 16); 
\ - write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); #define TENSORLOGICAL_FP(name0, src_type_name, dst_type_name, input_type,\ @@ -86,6 +88,47 @@ copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \ VXC_ReadImage, VXC_WriteImage) \ } +#define TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type,\ +out_copy_type, lgc_op, lgc_op2, read_fun, write_fun) \ + input_type vA;\ + copy_type src0;\ + input_type vB;\ + copy_type src1;\ + read_fun(vA,in0,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\ + _viv_asm(COPY, src0, vA, 16); \ + read_fun(vB,in1,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\ + _viv_asm(COPY, src1, vB, 16); \ + output_type dst; \ + dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \ + vxc_char8 data; \ + VXC_DP2x8(data,dst,dst,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero, 1),uniConvertInt16toInt8_2x8); \ + data &= 1; \ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +#define TENSORLOGICAL_BFP16(name0, src_type_name, dst_type_name, input_type,\ +copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \ + __kernel void logical_##name0##_##src_type_name##to##dst_type_name( \ + __read_only image2d_array_t in0, \ + __read_only image2d_array_t in1, \ + __write_only image2d_array_t output) \ +{\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\ + TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2,\ + VXC_ReadImage2DArray, VXC_WriteImage2DArray) \ +} + +#define TENSORLOGICAL_BFP16_2D(name0, src_type_name, dst_type_name, input_type,\ +copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \ + __kernel void logical_##name0##_##src_type_name##to##dst_type_name##_2D( \ + __read_only image2d_array_t in0, \ + __read_only image2d_array_t in1, \ + __write_only image2d_array_t output) \ +{\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\ + TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2,\ + VXC_ReadImage, VXC_WriteImage) \ +} + // name0, src_name, dst_name, input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2 TENSORLOGICAL(or, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ||, ) //TENSORLOGICAL(or, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ||, ) @@ -100,6 +143,10 @@ TENSORLOGICAL(xor, I8, I8, vxc_char8, vxc_char8, vxc_char8, vx //TENSORLOGICAL(xor, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!) //TENSORLOGICAL_FP(xor, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!) +TENSORLOGICAL_BFP16(or, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, ) +TENSORLOGICAL_BFP16(and, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, ) +TENSORLOGICAL_BFP16(xor, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!) + TENSORLOGICAL_2D(or, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ||, ) //TENSORLOGICAL_2D(or, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ||, ) //TENSORLOGICAL_2D(or, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, ) @@ -112,3 +159,7 @@ TENSORLOGICAL_2D(xor, I8, I8, vxc_char8, vxc_char8, vxc_char8, //TENSORLOGICAL_2D(xor, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ^, !!) //TENSORLOGICAL_2D(xor, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!) 
//TENSORLOGICAL_FP_2D(xor, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!) + +TENSORLOGICAL_BFP16_2D(or, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, ) +TENSORLOGICAL_BFP16_2D(and, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, ) +TENSORLOGICAL_BFP16_2D(xor, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_bf16.vx new file mode 100644 index 0000000..433dc4f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_bf16.vx @@ -0,0 +1,272 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +__kernel void gemm_BF16BF16toBF16(image2d_array_t inputA, + image2d_array_t inputB, image2d_array_t output, + int transposeA, int transposeB, + int adjointA, int adjointB, uint M, uint K, uint N) +{ + uint gidy = get_global_id(1); + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0); + + vxc_ushort8 valC0, valC1, src0, src1; + vxc_ushort8 srcA0, srcB0, srcA1, srcB1, outC; + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) + { + vxc_float4 tempA0, tempA1, tempA2, tempA3; + vxc_float4 tempB0, tempB1, tempB2, tempB3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 4; + coord_b.y += 4; + + VXC_DP2x8(src0, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempA0, src0, 16); + VXC_DP2x8(src1, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, tempA1, src1, 16); + VXC_DP2x8(src0, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempA2, src0, 16); + 
VXC_DP2x8(src1, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, tempA3, src1, 16); + VXC_DP2x8(src0, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempB0, src0, 16); + VXC_DP2x8(src1, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, tempB1, src1, 16); + VXC_DP2x8(src0, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempB2, src0, 16); + VXC_DP2x8(src1, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, tempB3, src1, 16); + + sum0 += (tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); + sum1 += (tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); + sum2 += (tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); + sum3 += (tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); + } + coord_b.y = gidy; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr); + _viv_asm(COPY, valC0, sum0, 16); + _viv_asm(COPY, valC1, sum1, 16); + VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(COPY, valC0, sum2, 16); + _viv_asm(COPY, valC1, sum3, 16); + VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void gemm_transa_BF16BF16toBF16( + image2d_array_t inputA, + image2d_array_t inputB, + image2d_array_t output, + int transposeA, + int transposeB, + int adjointA, + int adjointB, + uint M, uint K, uint N) +{ + uint gidy = get_global_id(1); + + vxc_ushort8 valC0, valC1; + vxc_ushort8 srcA, srcB, outC, src0, src1; + + int4 coord_a = (int4)(gidy, 0, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); + + vxc_float4 sum0 = (vxc_float4)(0); + vxc_float4 sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0); + vxc_float4 sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + vxc_float4 tempA0; + vxc_float4 tempB0; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;) + { + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_a.y++; + coord_b.y++; + + VXC_DP2x8(src0, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempA0, src0, 16); + + VXC_DP2x8(src1, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempB0, src1, 16); + + sum0 = (sum0 + tempA0.x * tempB0); + sum1 = (sum1 + tempA0.y * tempB0); + sum2 = (sum2 + tempA0.z * tempB0); + sum3 = (sum3 + tempA0.w * tempB0); + } + coord_b.y = gidy; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr); + _viv_asm(COPY, valC0, sum0, 16); + _viv_asm(COPY, valC1, sum1, 16); + VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + coord_b.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + coord_b.y++; + _viv_asm(COPY, valC0, sum2, 16); + _viv_asm(COPY, valC1, sum3, 16); + VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + coord_b.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void gemm_transb_BF16BF16toBF16(image2d_array_t inputA, + image2d_array_t inputB, + image2d_array_t output, + int transposeA, + int transposeB, + int adjointA, + int adjointB, + uint M, uint K, uint N) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0); + + vxc_float4 sum0 = (vxc_float4)(0); + vxc_float4 sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0); + vxc_float4 sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_ushort8 src0, src1; + + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;) + { + vxc_ushort8 srcA0,srcA1,srcA2,srcA3; + vxc_ushort8 srcB0,srcB1,srcB2,srcB3; + vxc_float4 tempA0, tempA1, tempA2, tempA3; + vxc_float4 tempB0, tempB1, tempB2, tempB3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 4; + coord_b.x += 4; + + VXC_DP2x8(src0, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempA0, src0, 16); + VXC_DP2x8(src1, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempA1, src1, 16); + VXC_DP2x8(src0, srcA2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempA2, src0, 16); + VXC_DP2x8(src1, srcA3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempA3, src1, 16); + + VXC_DP2x8(src0, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempB0, src0, 16); + VXC_DP2x8(src1, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempB1, src1, 16); + VXC_DP2x8(src0, srcB2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempB2, src0, 16); + VXC_DP2x8(src1, srcB3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, tempB3, src1, 16); + + sum0 += (float4)(dot(tempA0, tempB0), dot(tempA0, tempB1), dot(tempA0, tempB2), dot(tempA0, tempB3)); + sum1 += (float4)(dot(tempA1, tempB0), dot(tempA1, tempB1), dot(tempA1, tempB2), dot(tempA1, tempB3)); + sum2 += (float4)(dot(tempA2, tempB0), dot(tempA2, tempB1), dot(tempA2, tempB2), dot(tempA2, tempB3)); + sum3 += (float4)(dot(tempA3, tempB0), dot(tempA3, tempB1), dot(tempA3, tempB2), dot(tempA3, tempB3)); + } + + vxc_ushort8 valC0, valC1, valDst; + _viv_asm(COPY, valC0, sum0, 16); + _viv_asm(COPY, 
valC1, sum1, 16); + VXC_DP2x8(valDst, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + _viv_asm(COPY, valC0, sum2, 16); + _viv_asm(COPY, valC1, sum3, 16); + VXC_DP2x8(valDst, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx index 9d2ef89..bd211d4 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx @@ -11,6 +11,9 @@ _viv_uniform int ac2zero; _viv_uniform int bc2zero; _viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; #if (VX_VERSION==2) __kernel void gemm_F16F16toF16(image2d_array_t inputA, @@ -192,14 +195,9 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA, } #endif -__kernel void gemm_F32F32toF32(image2d_array_t inputA, - image2d_array_t inputB, - image2d_array_t output, - int transposeA, - int transposeB, - int adjointA, - int adjointB, - uint M, uint K, uint N) +__kernel void gemm_F32F32toF32( + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) { uint gidx = get_global_id(0); uint gidy = get_global_id(1); @@ -207,10 +205,8 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA, int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); int4 coord_b = (int4)(gidx, 0, (bc2zero ? 
0 : get_global_id(2)), 0); - vxc_float4 sum0 = (vxc_float4)(0); - vxc_float4 sum1 = (vxc_float4)(0); - vxc_float4 sum2 = (vxc_float4)(0); - vxc_float4 sum3 = (vxc_float4)(0); + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); vxc_int4 tmpOut0, tmpOut1; vxc_uchar16 outC; @@ -224,7 +220,6 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA, coord_a.x = i; coord_a.y = gidy; - coord_b.x = gidx; coord_b.y = i; @@ -257,4 +252,4 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA, write_imagef(output, coord_b, sum2); coord_b.y++; write_imagef(output, coord_b, sum3); -} \ No newline at end of file +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/maximum.vx b/src/tim/vx/internal/src/libnnext/ops/vx/maximum.vx index c40c720..cb7c067 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/maximum.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/maximum.vx @@ -222,6 +222,62 @@ __kernel void maximum_U8U8toU8_2D VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); } +__kernel void maximum_U8U8toI16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar16 src0, src1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_short8 dst0, dst1, dst; + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + dst = max(dst0, dst1); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_U8U8toI16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_uchar16 src0, src1; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_short8 dst0, dst1, dst; + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + dst = max(dst0, dst1); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + _viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8; _viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8; __kernel void maximum_I16I16toI16 diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/maximum_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/maximum_i16.vx index 15ab020..aab5d72 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/maximum_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/maximum_i16.vx @@ -170,4 +170,64 @@ __kernel void maximum_F16F16toI16_2D tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp); 
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8; +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp +__kernel void maximum_I16I16toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar16 dst0, dst1, dst; + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + dst = max(dst0, dst1); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void maximum_I16I16toU8_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_short8 src0, src1; + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar16 dst0, dst1, dst; + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + dst = max(dst0, dst1); + + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/minimum.vx b/src/tim/vx/internal/src/libnnext/ops/vx/minimum.vx index 4bfe529..0b3ef97 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/minimum.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/minimum.vx @@ -224,6 +224,62 @@ __kernel void minimum_U8U8toU8_2D VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); } +__kernel void minimum_U8U8toI16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar16 src0, src1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_short8 dst0, dst1, dst; + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + dst = min(dst0, dst1); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_U8U8toI16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_uchar16 src0, src1; + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_short8 dst0, dst1, dst; + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + dst = min(dst0, dst1); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + _viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8; _viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8; __kernel void minimum_I16I16toI16 diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/minimum_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/minimum_i16.vx index a314ca9..c2f5ca5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/minimum_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/minimum_i16.vx @@ -173,5 +173,65 @@ __kernel void minimum_F16F16toI16_2D tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp); VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8; +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp +__kernel void minimum_I16I16toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src0, src1; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar16 dst0, dst1, dst; + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + dst = min(dst0, dst1); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void minimum_I16I16toU8_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_short8 src0, src1; + VXC_ReadImage(src0, input0, coord.xy, 
VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_uchar16 dst0, dst1, dst; + vxc_ushort8 mp0, mp1; + _viv_asm(COPY, mp0, multAndoutZP0, 16); + _viv_asm(COPY, mp1, multAndoutZP1, 16); + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift0_Lo_2x8); + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\ + uniU8MulAndPostShift1_Lo_2x8); + dst = min(dst0, dst1); + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis0.vx index 3d8dd53..2652b0f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis0.vx @@ -17,6 +17,9 @@ _viv_uniform float e2InScale; _viv_uniform float rowSumScale; _viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; #define MOMENTS_AXIS0_QINT(src0_type_name, read0_type) \ __kernel void moments_axis0_##src0_type_name##toF16( \ @@ -262,6 +265,88 @@ __kernel void moments_axis0_I16toF16_2D( VXC_DP2x8(tmpVal, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void moments_axis0_BF16toBF16( + image2d_array_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, int axis_num) +{ + int gidy = get_global_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(0, gidy, gidz, 0); + vxc_ushort8 src0, src1; + vxc_ushort8 val; + vxc_float4 mean_vari0 = (vxc_float4)(0); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f); + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 vec0, vec1; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, vec1, src1, 16); + mean_vari0.x += dot(vec0, one) + dot(vec1, one); + mean_vari0.y += dot(vec0, vec0) + dot(vec1, vec1); + } + + mean_vari0 *= dimRatio; + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0; + + int2 coord_out = (int2)(gidy, gidz); + + vxc_short8 dst; + _viv_asm(COPY, src0, mean_vari0, 16); + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void moments_axis0_BF16toBF16_2D( + image2d_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, int axis_num) +{ + int gidy = get_global_id(0); + int2 coord = (int2)(0, gidy); + 
vxc_ushort8 src0, src1; + vxc_ushort8 val; + vxc_float4 mean_vari0 = (vxc_float4)(0); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f); + + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_ReadImage(val, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 vec0, vec1; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, vec1, src1, 16); + mean_vari0.x += dot(vec0, one) + dot(vec1, one); + mean_vari0.y += dot(vec0, vec0) + dot(vec1, vec1); + } + mean_vari0 *= dimRatio; + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0; + + int2 coord_out = (int2)(gidy, 0); + + vxc_short8 dst; + _viv_asm(COPY, src0, mean_vari0, 16); + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis012.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis012.vx index 6afb0a5..617719e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis012.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis012.vx @@ -18,6 +18,9 @@ _viv_uniform float e2InScale; _viv_uniform float rowSumScale; _viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; #define MOMENTS_AXIS012_QINT(src0_type_name, read0_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_##src0_type_name##toF16( \ @@ -236,4 +239,79 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_I1 VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_BF16toBF16( + image2d_array_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, + int axis_num) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int4 coord = (int4)(gidx, 0, 0, 0); + vxc_float4 sumsqr; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + float tmpSum = 0; + float tmpSqr = 0; + vxc_ushort8 src0, src1; + vxc_ushort8 val; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f); + + for(coord.z = 0; coord.z < channel; coord.z++) + { + for(coord.x = gidx; coord.x < width; coord.x += 128) + { + for(coord.y = 0; coord.y < height;) + { + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + float4 vec0, vec1; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, vec1, src1, 16); + tmpSum 
+= dot(vec0, one) + dot(vec1, one); + tmpSqr += dot(vec0, vec0) + dot(vec1, vec1); + } + } + } + lcl_sum[lidx] = tmpSum; + lcl_sqr[lidx] = tmpSqr; + + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(0, 0); + if(lidx == 0) + { + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = (float)(0); + float sqr = (float)(0); + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 mean_vari; + mean_vari.x = sum * dimRatio; + mean_vari.y = sqr * dimRatio; + mean_vari.y = mean_vari.y - mean_vari.x * mean_vari.x; + + vxc_short8 dst; + _viv_asm(COPY, src0, mean_vari, 16); + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis1.vx index 0be50bf..d303ed9 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis1.vx @@ -10,6 +10,8 @@ _viv_uniform float e2InScale; _viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; _viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; #define MOMENTS_AXIS1_QINT(src0_type_name, read0_type) \ __kernel void moments_axis1_##src0_type_name##toF16( \ @@ -197,3 +199,85 @@ __kernel void moments_axis1_F16toF16_2D( VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); } + +__kernel void moments_axis1_BF16toBF16( + image2d_array_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, int axis_num) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_ushort8 src0; + vxc_ushort8 val; + vxc_float4 sum = (vxc_float4)(0); + vxc_float4 sqr = (vxc_float4)(0); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + float4 vec0; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + + sum += vec0; + sqr += (vec0 * vec0); + } + + vxc_float4 mean = sum * dimRatio; + vxc_float4 vari = sqr * dimRatio; + vari = vari - mean * mean; + + int2 coord_out = (int2)(gidx, gidz); + vxc_short8 tmpdst0, tmpdst1, dst; + _viv_asm(COPY, tmpdst0, mean, 16); + _viv_asm(COPY, tmpdst1, vari, 16); + VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void moments_axis1_BF16toBF16_2D( + image2d_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, int axis_num) +{ + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, 0); + 
vxc_ushort8 src0; + vxc_ushort8 val; + vxc_float4 sum = (vxc_float4)(0); + vxc_float4 sqr = (vxc_float4)(0); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage(val, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + float4 vec0; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + + sum += vec0; + sqr += (vec0 * vec0); + } + + vxc_float4 mean = sum * dimRatio; + vxc_float4 vari = sqr * dimRatio; + vari = vari - mean * mean; + + int2 coord_out = (int2)(gidx, 0); + vxc_short8 tmpdst0, tmpdst1, dst; + _viv_asm(COPY, tmpdst0, mean, 16); + _viv_asm(COPY, tmpdst1, vari, 16); + VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis2.vx index c47c34f..ce473c0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_axis2.vx @@ -9,6 +9,8 @@ _viv_uniform float e2InScale; _viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; _viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; #define MOMENTS_AXIS2_QINT(src0_type_name, read0_type) \ __kernel void moments_axis2_##src0_type_name##toF16( \ @@ -95,6 +97,50 @@ __kernel void moments_axis2_F16toF16( _viv_asm(CONV, tmpVari, vari); VXC_DP2x8(tmpVal, tmpMean, tmpVari, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void moments_axis2_BF16toBF16( + image2d_array_t input, + image2d_t output_mean, + image2d_t output_vari, + int axis, + int axis_num) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int4 coord = (int4)(gidx, gidy, 0, 0); + vxc_ushort8 src0; + vxc_ushort8 val; + vxc_float4 sum = (vxc_float4)(0); + vxc_float4 sqr = (vxc_float4)(0); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + + for(coord.z = 0; coord.z < channel; coord.z++) + { + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + float4 vec0; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + + sum += vec0; + sqr += (vec0 * vec0); + } + + vxc_float4 mean = sum * dimRatio; + vxc_float4 vari = sqr * dimRatio; + vari = vari - mean * mean; + + int2 coord_out = (int2)(gidx, gidy); + + vxc_short8 tmpdst0, tmpdst1, dst; + _viv_asm(COPY, tmpdst0, mean, 16); + _viv_asm(COPY, tmpdst1, vari, 16); + VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0)); } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/moments_u8_axis012.vx b/src/tim/vx/internal/src/libnnext/ops/vx/moments_u8_axis012.vx index b456ee6..073c237 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/moments_u8_axis012.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/moments_u8_axis012.vx @@ -15,6 +15,9 @@ _viv_uniform float rowSumScale; _viv_uniform float4 output_ZP; _viv_uniform float4 outputScale; _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; #define MOMENTS_AXIS012_QINT_U8(src0_type_name, read0_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_##src0_type_name##toU8( \ @@ -72,4 +75,141 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_## VXC_WriteImage(output_vari, coord_out, src0.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ } \ } -MOMENTS_AXIS012_QINT_U8(U8, vxc_uchar16) \ No newline at end of file +MOMENTS_AXIS012_QINT_U8(U8, vxc_uchar16) + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_BF16toBF16( + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, + int axis, int axis_num) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + float tmpSum = 0; + float tmpSqr = 0; + vxc_ushort8 src0, src1; + vxc_ushort8 val; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f); + + for(coord.x = gidx; coord.x < width; coord.x += 128) + { + for(coord.y = 0; coord.y < height;) + { + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + float4 vec0, vec1; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, vec1, src1, 16); + tmpSum += dot(vec0, one) + dot(vec1, one); + tmpSqr += dot(vec0, vec0) + dot(vec1, vec1); + } + } + + lcl_sum[lidx] = tmpSum; + lcl_sqr[lidx] = tmpSqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(gidz, 0); + if(lidx == 0) + { + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0.0f; + float sqr = 0.0f; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 mean_vari; + mean_vari.x = sum * dimRatio; + mean_vari.y = sqr * dimRatio; + mean_vari.y = mean_vari.y - mean_vari.x * mean_vari.x; + + vxc_short8 dst; + _viv_asm(COPY, src0, mean_vari, 16); + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_BF16toBF16_2D( + image2d_array_t input, image2d_t output_mean, image2d_t output_vari, + int axis, int axis_num) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int2 
coord = (int2)(gidx, 0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + float tmpSum = 0; + float tmpSqr = 0; + vxc_ushort8 src0, src1; + vxc_ushort8 val; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f); + + for(coord.x = gidx; coord.x < width; coord.x += 128) + { + for(coord.y = 0; coord.y < height;) + { + VXC_ReadImage(val, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + + float4 vec0, vec1; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, vec1, src1, 16); + tmpSum += dot(vec0, one) + dot(vec1, one); + tmpSqr += dot(vec0, vec0) + dot(vec1, vec1); + } + } + + lcl_sum[lidx] = tmpSum; + lcl_sqr[lidx] = tmpSqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(0, 0); + if(lidx == 0) + { + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0.0f; + float sqr = 0.0f; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 mean_vari; + mean_vari.x = sum * dimRatio; + mean_vari.y = sqr * dimRatio; + mean_vari.y = mean_vari.y - mean_vari.x * mean_vari.x; + + vxc_short8 dst; + _viv_asm(COPY, src0, mean_vari, 16); + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx b/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx index 6d3cd52..eb248fb 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx @@ -203,3 +203,91 @@ __kernel void one_hot_##name0##to##name1##_2D \ ONE_HOT_ASYM_SH_IMPL_2D(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8) ONE_HOT_ASYM_SH_IMPL_2D(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +__kernel void one_hot_BF16toBF16 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int suffix_sz, + int on_val, + int off_val + ) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(0), get_global_id(0)); + + vxc_ushort8 src0, src1; + vxc_ushort8 val; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 vec0, vec1; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, vec1, src1, 16); + int4 data0 = convert_int4(vec0); + int4 data1 = convert_int4(vec1); + + do + { + int4 d0 = data0 == coord.zzzz ? on_val : off_val; + int4 d1 = data1 == coord.zzzz ? 
on_val : off_val; + + vxc_short8 dst; + VXC_DP2x8(dst, d0, d1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + + VXC_WriteImage2DArray(output, coord.xzyw, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + + coord.z ++; + } while (coord.z < depth); +} + +__kernel void one_hot_BF16toBF16_2D + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int suffix_sz, + int on_val, + int off_val + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + vxc_ushort8 src0, src1; + vxc_ushort8 val; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 vec0; + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + int4 data = convert_int4(vec0); + int4 data0, data1; + int4 d4 = (int4)(0, 1, 2, 3); + do + { + coord.zw = coord.xx + (int2)(0, 1); + vxc_short8 dst; + data0 = data.xxxx == d4 ? on_val : off_val; + data1 = data.yyyy == d4 ? on_val : off_val; + + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); + coord.zw = coord.zw + (int2)(2, 2); + + data0 = data.zzzz == d4 ? on_val : off_val; + data1 = data.wwww == d4 ? on_val : off_val; + + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); + d4 += 4; + coord.y += 4; + } while (coord.y < depth); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_2d.vx index 64018e7..752813e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_2d.vx @@ -7,13 +7,15 @@ _viv_uniform float input1Tail; _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4; _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; #define COMPARISONS_2D(func_name, src0_type_name, src1_type_name, \ src0_type, src0_copy_type, src1_type, src1_copy_type, cmp_op) \ __kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_2D( \ - __read_only image2d_array_t input0, \ - __read_only image2d_array_t input1, \ - __write_only image2d_array_t output \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ @@ -112,3 +114,42 @@ COMPARISONS_2D(not_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half COMPARISONS_2D(not_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, !=) COMPARISONS_2D(not_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, !=) +#define COMPARISONS_BF_2D(func_name, src0_type_name, src1_type_name, cmp_op) \ +__kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_2D( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), 
get_global_id(1)); \ + vxc_ushort8 src0, src1, srcA, srcB; \ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_ReadImage(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 vecA0, vecA1; \ + float4 vecB0, vecB1; \ + VXC_DP2x8(src0, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecA0, src0, 16); \ + VXC_DP2x8(src1, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, vecA1, src1, 16); \ + VXC_DP2x8(src0, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecB0, src0, 16); \ + VXC_DP2x8(src1, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, vecB1, src1, 16); \ + int4 dst0, dst1; \ + dst0 = (vecA0)cmp_op(vecB0); \ + dst1 = (vecA1)cmp_op(vecB1); \ + \ + vxc_char16 dst; \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + dst &= 1; \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} + +COMPARISONS_BF_2D(less, BF16, BF16, <) +COMPARISONS_BF_2D(great, BF16, BF16, >) +COMPARISONS_BF_2D(less_equal, BF16, BF16, <=) +COMPARISONS_BF_2D(great_equal, BF16, BF16, >=) +COMPARISONS_BF_2D(equal, BF16, BF16, ==) +COMPARISONS_BF_2D(not_equal, BF16, BF16, !=) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_3d.vx index 0fcc274..f24a924 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_3d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_3d.vx @@ -7,6 +7,8 @@ _viv_uniform float input1Tail; _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4; _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; #define COMPARISONS_3D(func_name, src0_type_name, src1_type_name, \ src0_type, src0_copy_type, src1_type, src1_copy_type, cmp_op) \ @@ -112,3 +114,42 @@ COMPARISONS_3D(not_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half COMPARISONS_3D(not_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, !=) COMPARISONS_3D(not_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, !=) +#define COMPARISONS_BF_3D(func_name, src0_type_name, src1_type_name, cmp_op) \ +__kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_3D( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + vxc_ushort8 src0, src1, srcA, srcB; \ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_ReadImage2DArray(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 vecA0, vecA1; \ + float4 vecB0, vecB1; \ + VXC_DP2x8(src0, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecA0, src0, 16); \ + VXC_DP2x8(src1, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, vecA1, src1, 16); \ + VXC_DP2x8(src0, srcB, zero, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecB0, src0, 16); \ + VXC_DP2x8(src1, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, vecB1, src1, 16); \ + int4 dst0, dst1; \ + dst0 = (vecA0)cmp_op(vecB0); \ + dst1 = (vecA1)cmp_op(vecB1); \ + \ + vxc_char16 dst; \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + dst &= 1; \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} + +COMPARISONS_BF_3D(less, BF16, BF16, <) +COMPARISONS_BF_3D(great, BF16, BF16, >) +COMPARISONS_BF_3D(less_equal, BF16, BF16, <=) +COMPARISONS_BF_3D(great_equal, BF16, BF16, >=) +COMPARISONS_BF_3D(equal, BF16, BF16, ==) +COMPARISONS_BF_3D(not_equal, BF16, BF16, !=) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_1.vx similarity index 100% rename from src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers.vx rename to src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_1.vx diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_2.vx new file mode 100644 index 0000000..ecf26d6 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_2.vx @@ -0,0 +1,129 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int out_height; +_viv_uniform VXC_512Bits uniResize8xUp_l00_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l01_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l10_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l11_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l20_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l21_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l30_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l31_4x8; +__kernel void resize_bilinear_U8toU8_SAME_8x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0); + coord_in.x = (coord_out.x * 2 - 7) >> 4; + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x; + + vxc_uchar16 in0, in1, in2, tmp, dst0, dst1, dst2, dst3; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + while (coord_out.y < out_height) + { + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8); + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8); + VXC_DP4x8(dst2, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8); + VXC_DP4x8(dst2, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8); + VXC_DP4x8(dst3, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8); + VXC_DP4x8(dst3, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8); + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8); + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8); + VXC_DP4x8(dst2, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8); + VXC_DP4x8(dst2, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8); + VXC_DP4x8(dst3, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8); + VXC_DP4x8(dst3, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), 
uniResize8xUp_l00_4x8); + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8); + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8); + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8); + VXC_DP4x8(dst2, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8); + VXC_DP4x8(dst2, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8); + VXC_DP4x8(dst3, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8); + VXC_DP4x8(dst3, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.y += 2; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8); + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8); + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8); + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8); + VXC_DP4x8(dst2, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8); + VXC_DP4x8(dst2, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8); + VXC_DP4x8(dst3, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8); + VXC_DP4x8(dst3, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd.vx index e02967d..994aadd 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd.vx @@ -7,10 +7,14 @@ _viv_uniform int offsetX; _viv_uniform int offsetY; _viv_uniform int offsetZ; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + __kernel void scatter_nd_F16toF16( __read_only image2d_t input0, __read_only image2d_t input1, - image2d_array_t output, + image2d_t output, int width, int area, int coord_dim @@ -38,11 +42,53 @@ __kernel void scatter_nd_F16toF16( VXC_WriteImage(output, (int2)(gidx, gidy), tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } 
+__kernel void scatter_nd_BF16toBF16( + __read_only image2d_t input0, + __read_only image2d_t input1, + image2d_t output, + int width, + int area, + int coord_dim + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 sum0 = (float4)(0); + float4 sum1 = (float4)(0); + vxc_ushort8 tmpVal; + for(int i = 0; i < index_num; i++) + { + int4 indice = read_imagei(input0, (int2)(0, i)); + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ; + if(gidy == idx) + { + vxc_ushort8 src0, src1; + VXC_ReadImage(tmpVal, input1, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 vec0, vec1; + VXC_DP2x8(src0, tmpVal, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, vec0, src0, 16); + VXC_DP2x8(src1, tmpVal, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, vec1, src1, 16); + sum0 += vec0; + sum1 += vec1; + } + } + vxc_ushort8 dst0, dst1, dst; + _viv_asm(COPY, dst0, sum0, 16); + _viv_asm(COPY, dst1, sum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, (int2)(gidx, gidy), dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + #define SCATTER_ND_QINT(src0_type_name, data_type) \ __kernel void scatter_nd_##src0_type_name##to##src0_type_name##( \ __read_only image2d_t input0, \ __read_only image2d_t input1, \ - image2d_array_t output, \ + image2d_t output, \ int width, \ int area, \ int coord_dim \ diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 324dade..93da98e 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -875,6 +875,8 @@ static const char argmax_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ _viv_uniform int4 packedArgIdx;\n\ _viv_uniform int argLenSub1;\n\ _viv_uniform VXC_512Bits uniExtractData_2x8;\n\ +_viv_uniform VXC_512Bits uniExtract1stU8toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniExtract2ndU8toI16_2x8;\n\ \n\ #define TENSOR_ARGMAX_AXIS2_16BITS(src_type_name, dst_type_name,\\\n\ src_type, copy_type, axis_type, dst_type, inst_type) \\\n\ @@ -939,6 +941,56 @@ TENSOR_ARGMAX_AXIS2_16BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_ #define TENSOR_ARGMAX_AXIS2_8BITS(src_type_name, dst_type_name, src_type, dst_type) \\\n\ __kernel void argmax_axis2_##src_type_name##to##dst_type_name( \\\n\ __read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \\\n\ + src_type src; \\\n\ + src_type maxVal; \\\n\ + VXC_ReadImage2DArray(maxVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + dst_type axis; \\\n\ + dst_type packIdx; \\\n\ + \\\n\ + _viv_asm(COPY, axis, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + \\\n\ + coord.z --; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z --; \\\n\ + packIdx --; \\\n\ + maxVal = max(maxVal, src); \\\n\ + src_type condition; \\\n\ + VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + axis = condition ? 
packIdx : axis; \\\n\ + } while (coord.z >= 0); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMAX_AXIS2_8BITS(I8, U8, vxc_char16, vxc_uchar16)\n\ +TENSOR_ARGMAX_AXIS2_8BITS(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +\n\ +#define TENSOR_ARGMAX_AXIS2_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \\\n\ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name##_2D( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_WriteImage(output, coord, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, I16, vxc_char8, vxc_short8)\n\ +TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, U8, vxc_char8, vxc_uchar8)\n\ +TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, I16, vxc_uchar8, vxc_short8)\n\ +TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, U8, vxc_uchar8, vxc_uchar8)\n\ +\n\ +#define TENSOR_ARGMAX_AXIS2_MIX(src_type_name, dst_type_name, src_type, dst_type) \\\n\ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name( \\\n\ +__read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axisVal \\\n\ ) \\\n\ @@ -967,26 +1019,49 @@ __write_only image2d_array_t output, \\\n\ \\\n\ VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -TENSOR_ARGMAX_AXIS2_8BITS(I8, I16, vxc_char8, vxc_short8)\n\ -TENSOR_ARGMAX_AXIS2_8BITS(I8, U8, vxc_char8, vxc_uchar8)\n\ -TENSOR_ARGMAX_AXIS2_8BITS(U8, I16, vxc_uchar8, vxc_short8)\n\ -TENSOR_ARGMAX_AXIS2_8BITS(U8, U8, vxc_uchar8, vxc_uchar8)\n\ +TENSOR_ARGMAX_AXIS2_MIX(I8, I16, vxc_char8, vxc_short8)\n\ +TENSOR_ARGMAX_AXIS2_MIX(U8, I16, vxc_uchar8, vxc_short8)\n\ \n\ -#define TENSOR_ARGMAX_AXIS2_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \\\n\ - __kernel void argmax_axis2_##src_type_name##to##dst_type_name##_2D( \\\n\ +#define TENSOR_ARGMAX_AXIS2_MIX_OPT(src_type_name, dst_type_name, src_type, dst_type) \\\n\ + __kernel void argmax_axis2_##src_type_name##to##dst_type_name##_opt( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axisVal \\\n\ ) \\\n\ { \\\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ - dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ - VXC_WriteImage(output, coord, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \\\n\ + src_type src; \\\n\ + src_type maxVal; \\\n\ + VXC_ReadImage2DArray(maxVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_uchar16 axis; \\\n\ + vxc_uchar16 packIdx; \\\n\ + \\\n\ + _viv_asm(COPY, axis, packedArgIdx, 16); \\\n\ + _viv_asm(COPY, packIdx, packedArgIdx, 16); \\\n\ + \\\n\ + coord.z --; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z --; \\\n\ + packIdx --; \\\n\ + maxVal = max(maxVal, src); \\\n\ + src_type condition; \\\n\ + VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \\\n\ + axis = condition ? 
packIdx : axis; \\\n\ + } while (coord.z >= 0); \\\n\ + vxc_short8 dst0, dst1; \\\n\ + VXC_DP2x8(dst0, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniExtract1stU8toI16_2x8); \\\n\ + VXC_DP2x8(dst1, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniExtract2ndU8toI16_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + VXC_WriteImage(output, coord.xy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, I16, vxc_char8, vxc_short8)\n\ -TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, U8, vxc_char8, vxc_uchar8)\n\ -TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, I16, vxc_uchar8, vxc_short8)\n\ -TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, U8, vxc_uchar8, vxc_uchar8)\n\ +TENSOR_ARGMAX_AXIS2_MIX_OPT(I8, I16, vxc_char16, vxc_short8)\n\ +TENSOR_ARGMAX_AXIS2_MIX_OPT(U8, I16, vxc_uchar16, vxc_short8)\n\ "; /* end of argmax_axis2_vx*/ static const char argmin_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -2835,10 +2910,10 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(\n\ vxc_short8 w_zp = (short)weight_ZP;\n\ vxc_uchar16 input_val = 0, weight_val = 0;\n\ int temp = 0, i, j;\n\ - Tensor src_tensor = create_image_from_image2d(input, 1);\n\ - uchar *src_ptr_base = (uchar *)src_image.ptr;\n\ + Tensor src_tensor = create_tensor_from_image2d_array(input, 1);\n\ + uchar *src_ptr_base = (uchar *)src_tensor.ptr;\n\ uchar *src_ptr;\n\ - Tensor dst_tensor = create_image_from_image2d(output, 1);\n\ + Tensor dst_tensor = create_tensor_from_image2d_array(output, 1);\n\ uchar *dst_ptr = (uchar *)dst_tensor.ptr;\n\ \n\ temp = read_imagei(bias, coord.yz).x;\n\ @@ -2847,7 +2922,7 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(\n\ \n\ for (i = 0; i < input_height; i++)\n\ {\n\ - src_ptr = src_ptr_base + (coord.x + coord.z * src_image.stride_y);\n\ + src_ptr = src_ptr_base + (coord.x + coord.z * src_tensor.stride_y);\n\ for (j = 0; j < kernel_cnt_x16; j++)\n\ {\n\ VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \\\n\ @@ -2892,6 +2967,830 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(\n\ \n\ "; /* end of conv1d_ovxlib_k1024_vx*/ +static const char custom_softmax_vx[] = "/*\n\ + ============================================================================\n\ + Name : Softmax2.vx\n\ + Author : VSI\n\ + Version :\n\ + Copyright : Your copyright notice\n\ + Description :\n\ + ============================================================================\n\ + */\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32;\n\ +_viv_uniform int sf_size;\n\ + #define F_MAX(a,b) ((a)>(b)?(a):(b))\n\ +__kernel void Softmax2VXC\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(0,0,0,0);\n\ + float fMax = 0.0;\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_char8 val;\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ +\n\ + fMax = F_MAX(fMax, fval);\n\ + }\n\ +\n\ + float fProbSum = 0.0f;\n\ + vxc_short8 dst;\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_char8 val;\n\ +\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), 
Uni4x4_Fp16ToFp32);\n\ +\n\ + float fOut = (float)exp(fval - fMax);\n\ + fProbSum += fOut;\n\ + half hVal;\n\ + _viv_asm(CONV,hVal,fOut);\n\ + _viv_asm(COPY,dst,hVal, 4);\n\ + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_short8 val;\n\ + vxc_half8 val_h;\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + _viv_asm(COPY, val_h,val, 16);\n\ + VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ +\n\ + float fOut =fval/fProbSum;\n\ + half hVal;\n\ + _viv_asm(CONV,hVal,fOut);\n\ + _viv_asm(COPY,dst,hVal, 4);\n\ + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of custom_softmax_vx*/ + +static const char custom_warp_affine_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float4 matrix0;\n\ +_viv_uniform float2 matrix1;\n\ +_viv_uniform float4 matrix4;\n\ +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_2D\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_affine_bilinear_U8toU8_2D\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_ReadImage(src0, input, 
coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, 
VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_affine_nearest_neighbor_U8toU8\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + 
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_affine_bilinear_U8toU8\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 
0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of custom_warp_affine_vx*/ + +static const char custom_warp_perspective_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float4 matrix0;\n\ +_viv_uniform float4 matrix1;\n\ +_viv_uniform float4 matrix2;\n\ +_viv_uniform float4 matrix4;\n\ +__kernel void custom_warp_perspective_nearest_neighbor_U8toU8_2D\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only 
image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5,\n\ + float _m6,\n\ + float _m7,\n\ + float _m8\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f0 = convert_float4(coord_in);\n\ +\n\ + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ + z0.zw = z0.zw + 2 * matrix1.z;\n\ + float4 z1 = z0 + 4 * matrix1.z;\n\ +\n\ + z0 = 1.0f / z0;\n\ + z1 = 1.0f / z1;\n\ +\n\ + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + float4 coord_f = coord_f0 * z0.xxyy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z0.zzww;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.xxyy;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.zzww;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5,\n\ + float _m6,\n\ + float _m7,\n\ + float _m8\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f0 = convert_float4(coord_in);\n\ +\n\ + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ + z0.zw = z0.zw + 2 * matrix1.z;\n\ + float4 z1 = z0 + 4 * matrix1.z;\n\ +\n\ + z0 = 1.0f / z0;\n\ + z1 = 1.0f / z1;\n\ +\n\ + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + float4 coord_f = coord_f0 * z0.xxyy;\n\ +\n\ + coord_in = convert_int4(floor(coord_f));\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ 
+#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z0.zzww;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.xxyy;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.zzww;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, 
coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +#define IMAGE_LOAD_3D(dst, xoffset, yoffset, start, end) \\\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, VXC_5BITOFFSET_XY(xoffset, yoffset), \\\n\ + VXC_MODIFIER(start, end, 0, VXC_RM_TowardZero, 0));\n\ +__kernel void custom_warp_perspective_nearest_neighbor_U8toU8\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5,\n\ + float _m6,\n\ + float _m7,\n\ + float _m8\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f0 = convert_float4(coord_in);\n\ +\n\ + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ + z0.zw = z0.zw + 2 * matrix1.z;\n\ + float4 z1 = z0 + 4 * matrix1.z;\n\ +\n\ + z0 = 1.0f / z0;\n\ + z1 = 1.0f / z1;\n\ +\n\ + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + float4 coord_f = coord_f0 * z0.xxyy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 dst;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 0, 0)\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 1, 1)\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z0.zzww;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 2, 2)\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 3, 3)\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.xxyy;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 4, 4)\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 5, 5)\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.zzww;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 6, 6)\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 7, 7)\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_perspective_bilinear_U8toU8\n\ +(\n\ + __read_only image2d_array_t input,\n\ + 
__write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5,\n\ + float _m6,\n\ + float _m7,\n\ + float _m8\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f0 = convert_float4(coord_in);\n\ +\n\ + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ + z0.zw = z0.zw + 2 * matrix1.z;\n\ + float4 z1 = z0 + 4 * matrix1.z;\n\ +\n\ + z0 = 1.0f / z0;\n\ + z1 = 1.0f / z1;\n\ +\n\ + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + float4 coord_f = coord_f0 * z0.xxyy;\n\ +\n\ + coord_in = convert_int4(floor(coord_f));\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + IMAGE_LOAD_3D(src0, 0, 0, 0, 1)\n\ + IMAGE_LOAD_3D(src1, 0, 1, 0, 1)\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(src0, 0, 0, 0, 1)\n\ + IMAGE_LOAD_3D(src1, 0, 1, 0, 1)\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z0.zzww;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(src0, 0, 0, 0, 1)\n\ + IMAGE_LOAD_3D(src1, 0, 1, 0, 1)\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(src0, 0, 0, 0, 1)\n\ + IMAGE_LOAD_3D(src1, 0, 1, 0, 1)\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.xxyy;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(src0, 0, 0, 0, 1)\n\ + IMAGE_LOAD_3D(src1, 0, 1, 0, 1)\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + 
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(src0, 0, 0, 0, 1)\n\ + IMAGE_LOAD_3D(src1, 0, 1, 0, 1)\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.zzww;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(src0, 0, 0, 0, 1)\n\ + IMAGE_LOAD_3D(src1, 0, 1, 0, 1)\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(src0, 0, 0, 0, 1)\n\ + IMAGE_LOAD_3D(src1, 0, 1, 0, 1)\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of custom_warp_perspective_vx*/ + static const char depth2space_crd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ @@ -3198,7 +4097,8 @@ __kernel void depth2space_crd_F16toI16_blk2(\n\ VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_out.x += 8;\n\ VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of depth2space_crd_vx*/ +}\n\ +"; /* end of depth2space_crd_vx*/ static const char depthwise_conv1d_src0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -4047,6 +4947,11 @@ float4 eltwise_unary_sin(float4 x)\n\ return native_sin(x);\n\ }\n\ \n\ +float4 eltwise_unary_cos(float4 x)\n\ +{\n\ + return native_cos(x);\n\ +}\n\ +\n\ #define logE (1.44269502f)\n\ #define twoLogE (logE * 2.0f)\n\ float4 eltwise_unary_exp(float4 x)\n\ @@ -4228,6 +5133,17 @@ ELTSISE_UNARY_2D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_u ELTSISE_UNARY_2D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ ELTSISE_UNARY_2D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ ELTSISE_UNARY_2D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//COS\n\ +ELTSISE_UNARY_2D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(cos, U8, U8, vxc_uchar8, 
vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ //LOG\n\ ELTSISE_UNARY_2D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ ELTSISE_UNARY_2D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ @@ -4354,6 +5270,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ ELTSISE_UNARY_BF16_2D(exp)\n\ //SIN\n\ ELTSISE_UNARY_BF16_2D(sin)\n\ +//COS\n\ +ELTSISE_UNARY_BF16_2D(cos)\n\ //LOG\n\ ELTSISE_UNARY_BF16_2D(log)\n\ //ELU\n\ @@ -4382,6 +5300,11 @@ float4 eltwise_unary_sin(float4 x)\n\ return native_sin(x);\n\ }\n\ \n\ +float4 eltwise_unary_cos(float4 x)\n\ +{\n\ + return native_cos(x);\n\ +}\n\ +\n\ #define logE (1.44269502f)\n\ #define twoLogE (logE * 2.0f)\n\ float4 eltwise_unary_exp(float4 x)\n\ @@ -4563,6 +5486,17 @@ ELTSISE_UNARY_3D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_u ELTSISE_UNARY_3D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ ELTSISE_UNARY_3D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ ELTSISE_UNARY_3D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//COS\n\ +ELTSISE_UNARY_3D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ //LOG\n\ ELTSISE_UNARY_3D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ ELTSISE_UNARY_3D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ @@ -4688,6 +5622,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ ELTSISE_UNARY_BF16(exp)\n\ //SIN\n\ ELTSISE_UNARY_BF16(sin)\n\ +//COS\n\ +ELTSISE_UNARY_BF16(cos)\n\ //LOG\n\ ELTSISE_UNARY_BF16(log)\n\ //ELU\n\ @@ -5216,8 +6152,6 @@ __kernel void gather_F16toF16(\n\ int gidz = get_global_id(2); // block_num\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ -\n\ -\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ @@ -5491,6 +6425,245 @@ GATHER_AXIS0_ARRAY(I8, vxc_char16, char, vxc_char4)\n\ GATHER_AXIS0_ARRAY(I16, vxc_short8, short, vxc_short4)\n\ GATHER_AXIS0_ARRAY(F16, vxc_short8, short, vxc_short4)"; /* end of gather_array_vx*/ +static const char gather_batch_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int indices_num;\n\ +_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8;\n\ +_viv_uniform int batch;\n\ +\n\ +__kernel void gather_batch_I8toI8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int 
axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + vxc_char16 src;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z++;\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + }\n\ +}\n\ +\n\ +__kernel void gather_batch_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + vxc_uchar16 src;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z++;\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + }\n\ +}\n\ +\n\ +__kernel void gather_batch_I16toI16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z++;\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + }\n\ +}\n\ +\n\ +__kernel void gather_batch_F16toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z++;\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 7, 
0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + }\n\ +}\n\ +\n\ +__kernel void gather_batch_I8toI8_axis0(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 indices = read_imagei(input1, coord.xz);\n\ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_char16 src, dst;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.y;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.z;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.w;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtraCopyDpKeepinEvis_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_batch_U8toU8_axis0(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 indices = read_imagei(input1, coord.xz);\n\ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar16 src, dst;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.y;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.z;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.w;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtraCopyDpKeepinEvis_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_batch_I16toI16_axis0(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 indices = read_imagei(input1, coord.xz);\n\ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.y;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.z;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.w;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtraCopyDpKeepinEvis_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, 
dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_batch_F16toF16_axis0(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 indices = read_imagei(input1, coord.xz);\n\ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.y;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.z;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.w;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtraCopyDpKeepinEvis_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of gather_batch_vx*/ + static const char gather_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int indices_num;\n\ @@ -5698,6 +6871,244 @@ __kernel void gather_I16toF16_axis0(\n\ }\n\ "; /* end of gather_mix_vx*/ +static const char gather_mix_batch_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int indices_num;\n\ +_viv_uniform int batch;\n\ +\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8;\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +\n\ +#define GATHER_BATCH_8BITS_TO_F16(src0_type_name, read_type) \\\n\ +__kernel void gather_batch_##src0_type_name##toF16( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int block_num, \\\n\ + int axis_num \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + \\\n\ + int2 coord_idx = (int2)(gidy, 0); \\\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0); \\\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + \\\n\ + for(; coord_idx.y < batch;) \\\n\ + { \\\n\ + int4 indice = read_imagei(input1, coord_idx); \\\n\ + coord_idx.y++; \\\n\ + coord_in.y = gidz * axis_num + indice.x; \\\n\ + \\\n\ + read_type src; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z++; \\\n\ + vxc_half8 src0, src1; \\\n\ + vxc_short8 dst0, dst1; \\\n\ + \\\n\ + VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_DP2x8(src1,src,ms0, VXC_MODIFIER(0,7,8, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst0, src0, 16); \\\n\ + _viv_asm(COPY, dst1, src1, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + VXC_WriteImage2DArray(output, coord, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + coord.x = gidx; \\\n\ + } \\\n\ +}\n\ 
+GATHER_BATCH_8BITS_TO_F16(U8, vxc_uchar16)\n\ +GATHER_BATCH_8BITS_TO_F16(I8, vxc_char16)\n\ +\n\ +#define GATHER_BATCH_F16_TO_QINT(src1_type_name, write_type) \\\n\ +__kernel void gather_batch_F16to##src1_type_name( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int block_num, \\\n\ + int axis_num \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + \\\n\ + int2 coord_idx = (int2)(gidy, 0); \\\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0); \\\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + for(; coord_idx.y < batch;) \\\n\ + { \\\n\ + int4 indice = read_imagei(input1, coord_idx); \\\n\ + coord_idx.y++; \\\n\ + coord_in.y = gidz * axis_num + indice.x; \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z++; \\\n\ + \\\n\ + vxc_half8 data; \\\n\ + write_type dst; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + } \\\n\ +}\n\ +GATHER_BATCH_F16_TO_QINT(U8, vxc_uchar16)\n\ +GATHER_BATCH_F16_TO_QINT(I8, vxc_char16)\n\ +GATHER_BATCH_F16_TO_QINT(I16, vxc_short8)\n\ +\n\ +__kernel void gather_batch_I16toF16(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + vxc_half8 src0;\n\ + vxc_short8 dst0;\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z++;\n\ + VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ + _viv_asm(COPY, dst0, src0, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + }\n\ +}\n\ +\n\ +#define GATHER_BATCH_8BITS_TO_F16_AXIS0(src0_type_name, read_type) \\\n\ +__kernel void gather_batch_##src0_type_name##toF16_axis0( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int block_num, \\\n\ + int axis_num \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 indices = read_imagei(input1, coord.xz); \\\n\ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + read_type src; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = indices.y; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); 
\\\n\ + coord_in.x = indices.z; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = indices.w; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_half8 src0; \\\n\ + vxc_short8 dst0; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst0, src0, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GATHER_BATCH_8BITS_TO_F16_AXIS0(U8, vxc_uchar16)\n\ +GATHER_BATCH_8BITS_TO_F16_AXIS0(I8, vxc_char16)\n\ +\n\ +#define GATHER_BATCH_F16_TO_QINT_AXIS0(src1_type_name, write_type) \\\n\ +__kernel void gather_batch_F16to##src1_type_name##_axis0( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int block_num, \\\n\ + int axis_num \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 indices = read_imagei(input1, coord.xz); \\\n\ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = indices.y; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = indices.z; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = indices.w; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + vxc_half8 data; \\\n\ + write_type dst; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GATHER_BATCH_F16_TO_QINT_AXIS0(U8, vxc_uchar16)\n\ +GATHER_BATCH_F16_TO_QINT_AXIS0(I8, vxc_char16)\n\ +GATHER_BATCH_F16_TO_QINT_AXIS0(I16, vxc_short8)\n\ +\n\ +__kernel void gather_batch_I16toF16_axis0(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 indices = read_imagei(input1, coord.xz);\n\ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.y;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.z;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.w;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_half8 src0;\n\ + vxc_short8 dst0;\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + 
uniU8MulAndPostShift_0_Lo_2x8);\n\ + _viv_asm(COPY, dst0, src0, 16);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of gather_mix_batch_vx*/ + static const char gather_nd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void gather_nd_I8toI8_1D(\n\ @@ -12425,7 +13836,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ { \\\n\ int lidx = get_local_id(0); \\\n\ int offset = get_global_id(0); \\\n\ - Image src_img = create_image_from_image2d(input, 1);\n\ + Image src_img = create_image_from_image2d(input, 1); \\\n\ uchar *src_ptr_base = (uchar *)src_img.ptr; \\\n\ uchar *src_ptr; \\\n\ vxc_uchar8 src0, src1; \\\n\ @@ -16772,6 +18183,8 @@ __kernel void logical_not_I8toI8_2D(\n\ static const char logical_ops_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ +_viv_uniform VXC_512Bits uniConvertInt16toInt8_2x8;\n\ +\n\ #define TENSORLOGICAL_PROCESS(input_type, copy_type, output_type, out_copy_type,\\\n\ lgc_op, lgc_op2, read_fun, write_fun) \\\n\ input_type vA;\\\n\ @@ -16831,7 +18244,7 @@ out_copy_type, lgc_op, lgc_op2, read_fun, write_fun) \\\n\ VXC_DP2x8(tmpOut,dst,dst,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero, 0),uniMulShortMinus1toFp16_2x8); \\\n\ out_copy_type data; \\\n\ _viv_asm(COPY, data, tmpOut, 16); \\\n\ - write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ \n\ #define TENSORLOGICAL_FP(name0, src_type_name, dst_type_name, input_type,\\\n\ @@ -16858,6 +18271,47 @@ copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \\\n\ VXC_ReadImage, VXC_WriteImage) \\\n\ }\n\ \n\ +#define TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type,\\\n\ +out_copy_type, lgc_op, lgc_op2, read_fun, write_fun) \\\n\ + input_type vA;\\\n\ + copy_type src0;\\\n\ + input_type vB;\\\n\ + copy_type src1;\\\n\ + read_fun(vA,in0,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\\\n\ + _viv_asm(COPY, src0, vA, 16); \\\n\ + read_fun(vB,in1,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\\\n\ + _viv_asm(COPY, src1, vB, 16); \\\n\ + output_type dst; \\\n\ + dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \\\n\ + vxc_char8 data; \\\n\ + VXC_DP2x8(data,dst,dst,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero, 1),uniConvertInt16toInt8_2x8); \\\n\ + data &= 1; \\\n\ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define TENSORLOGICAL_BFP16(name0, src_type_name, dst_type_name, input_type,\\\n\ +copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \\\n\ + __kernel void logical_##name0##_##src_type_name##to##dst_type_name( \\\n\ + __read_only image2d_array_t in0, \\\n\ + __read_only image2d_array_t in1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{\\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\\\n\ + TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2,\\\n\ + VXC_ReadImage2DArray, VXC_WriteImage2DArray) \\\n\ +}\n\ +\n\ +#define TENSORLOGICAL_BFP16_2D(name0, src_type_name, dst_type_name, input_type,\\\n\ +copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \\\n\ + __kernel void logical_##name0##_##src_type_name##to##dst_type_name##_2D( \\\n\ + __read_only image2d_array_t in0, \\\n\ + __read_only image2d_array_t in1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{\\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\\\n\ + 
TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2,\\\n\ + VXC_ReadImage, VXC_WriteImage) \\\n\ +}\n\ +\n\ // name0, src_name, dst_name, input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2\n\ TENSORLOGICAL(or, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ||, )\n\ //TENSORLOGICAL(or, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ||, )\n\ @@ -16872,6 +18326,10 @@ TENSORLOGICAL(xor, I8, I8, vxc_char8, vxc_char8, vxc_char8, vx //TENSORLOGICAL(xor, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)\n\ //TENSORLOGICAL_FP(xor, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)\n\ \n\ +TENSORLOGICAL_BFP16(or, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, )\n\ +TENSORLOGICAL_BFP16(and, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, )\n\ +TENSORLOGICAL_BFP16(xor, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)\n\ +\n\ TENSORLOGICAL_2D(or, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ||, )\n\ //TENSORLOGICAL_2D(or, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ||, )\n\ //TENSORLOGICAL_2D(or, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, )\n\ @@ -16884,6 +18342,10 @@ TENSORLOGICAL_2D(xor, I8, I8, vxc_char8, vxc_char8, vxc_char8, //TENSORLOGICAL_2D(xor, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ^, !!)\n\ //TENSORLOGICAL_2D(xor, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)\n\ //TENSORLOGICAL_FP_2D(xor, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)\n\ +\n\ +TENSORLOGICAL_BFP16_2D(or, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, )\n\ +TENSORLOGICAL_BFP16_2D(and, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, )\n\ +TENSORLOGICAL_BFP16_2D(xor, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)\n\ "; /* end of logical_ops_vx*/ static const char lstmunit_activation_BP_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -20706,6 +22168,280 @@ LSTMUNIT_S_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoi LSTMUNIT_S_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ "; /* end of lstmunit_activation_S_U8_vx*/ +static const char matrixmul_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +__kernel void gemm_BF16BF16toBF16(image2d_array_t inputA,\n\ + image2d_array_t inputB, image2d_array_t output,\n\ + int transposeA, int transposeB,\n\ + int adjointA, int adjointB, uint M, uint K, uint N)\n\ +{\n\ + uint gidy = get_global_id(1);\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0);\n\ +\n\ + vxc_ushort8 valC0, valC1, src0, src1;\n\ + vxc_ushort8 srcA0, srcB0, srcA1, srcB1, outC;\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)\n\ + {\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 4;\n\ + coord_b.y += 4;\n\ +\n\ + VXC_DP2x8(src0, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempA0, src0, 16);\n\ + VXC_DP2x8(src1, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, tempA1, src1, 16);\n\ + VXC_DP2x8(src0, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempA2, src0, 16);\n\ + VXC_DP2x8(src1, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, tempA3, src1, 16);\n\ + VXC_DP2x8(src0, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempB0, src0, 16);\n\ + VXC_DP2x8(src1, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, tempB1, src1, 16);\n\ + VXC_DP2x8(src0, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempB2, src0, 16);\n\ + VXC_DP2x8(src1, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, tempB3, src1, 16);\n\ +\n\ + sum0 += (tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3);\n\ + sum1 += (tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3);\n\ + sum2 += (tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3);\n\ + sum3 += (tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3);\n\ + }\n\ + 
coord_b.y = gidy;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr);\n\ + _viv_asm(COPY, valC0, sum0, 16);\n\ + _viv_asm(COPY, valC1, sum1, 16);\n\ + VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(COPY, valC0, sum2, 16);\n\ + _viv_asm(COPY, valC1, sum3, 16);\n\ + VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gemm_transa_BF16BF16toBF16(\n\ + image2d_array_t inputA,\n\ + image2d_array_t inputB,\n\ + image2d_array_t output,\n\ + int transposeA,\n\ + int transposeB,\n\ + int adjointA,\n\ + int adjointB,\n\ + uint M, uint K, uint N)\n\ +{\n\ + uint gidy = get_global_id(1);\n\ +\n\ + vxc_ushort8 valC0, valC1;\n\ + vxc_ushort8 srcA, srcB, outC, src0, src1;\n\ +\n\ + int4 coord_a = (int4)(gidy, 0, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0);\n\ +\n\ + vxc_float4 sum0 = (vxc_float4)(0);\n\ + vxc_float4 sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0);\n\ + vxc_float4 sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + vxc_float4 tempA0;\n\ + vxc_float4 tempB0;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;)\n\ + {\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.y++;\n\ + coord_b.y++;\n\ +\n\ + VXC_DP2x8(src0, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempA0, src0, 16);\n\ +\n\ + VXC_DP2x8(src1, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempB0, src1, 16);\n\ +\n\ + sum0 = (sum0 + tempA0.x * tempB0);\n\ + sum1 = (sum1 + tempA0.y * tempB0);\n\ + sum2 = (sum2 + tempA0.z * tempB0);\n\ + sum3 = (sum3 + tempA0.w * tempB0);\n\ + }\n\ + coord_b.y = gidy;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr);\n\ + _viv_asm(COPY, valC0, sum0, 16);\n\ + _viv_asm(COPY, valC1, sum1, 16);\n\ + VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + 
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_b.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_b.y++;\n\ + _viv_asm(COPY, valC0, sum2, 16);\n\ + _viv_asm(COPY, valC1, sum3, 16);\n\ + VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_b.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gemm_transb_BF16BF16toBF16(image2d_array_t inputA,\n\ + image2d_array_t inputB,\n\ + image2d_array_t output,\n\ + int transposeA,\n\ + int transposeB,\n\ + int adjointA,\n\ + int adjointB,\n\ + uint M, uint K, uint N)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0);\n\ +\n\ + vxc_float4 sum0 = (vxc_float4)(0);\n\ + vxc_float4 sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0);\n\ + vxc_float4 sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_ushort8 src0, src1;\n\ +\n\ + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;)\n\ + {\n\ + vxc_ushort8 srcA0,srcA1,srcA2,srcA3;\n\ + vxc_ushort8 srcB0,srcB1,srcB2,srcB3;\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 4;\n\ + coord_b.x += 4;\n\ +\n\ + VXC_DP2x8(src0, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempA0, src0, 16);\n\ + VXC_DP2x8(src1, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempA1, src1, 16);\n\ + VXC_DP2x8(src0, 
srcA2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempA2, src0, 16);\n\ + VXC_DP2x8(src1, srcA3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempA3, src1, 16);\n\ +\n\ + VXC_DP2x8(src0, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempB0, src0, 16);\n\ + VXC_DP2x8(src1, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempB1, src1, 16);\n\ + VXC_DP2x8(src0, srcB2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempB2, src0, 16);\n\ + VXC_DP2x8(src1, srcB3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, tempB3, src1, 16);\n\ +\n\ + sum0 += (float4)(dot(tempA0, tempB0), dot(tempA0, tempB1), dot(tempA0, tempB2), dot(tempA0, tempB3));\n\ + sum1 += (float4)(dot(tempA1, tempB0), dot(tempA1, tempB1), dot(tempA1, tempB2), dot(tempA1, tempB3));\n\ + sum2 += (float4)(dot(tempA2, tempB0), dot(tempA2, tempB1), dot(tempA2, tempB2), dot(tempA2, tempB3));\n\ + sum3 += (float4)(dot(tempA3, tempB0), dot(tempA3, tempB1), dot(tempA3, tempB2), dot(tempA3, tempB3));\n\ + }\n\ +\n\ + vxc_ushort8 valC0, valC1, valDst;\n\ + _viv_asm(COPY, valC0, sum0, 16);\n\ + _viv_asm(COPY, valC1, sum1, 16);\n\ + VXC_DP2x8(valDst, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + _viv_asm(COPY, valC0, sum2, 16);\n\ + _viv_asm(COPY, valC1, sum3, 16);\n\ + VXC_DP2x8(valDst, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of matrixmul_bf16_vx*/ + static const char matrixmul_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ @@ -20719,6 +22455,9 @@ _viv_uniform int ac2zero;\n\ _viv_uniform int bc2zero;\n\ \n\ _viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ #if (VX_VERSION==2)\n\ __kernel void gemm_F16F16toF16(image2d_array_t inputA,\n\ @@ -20900,14 +22639,9 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA,\n\ }\n\ #endif\n\ \n\ -__kernel void gemm_F32F32toF32(image2d_array_t inputA,\n\ - image2d_array_t inputB,\n\ - image2d_array_t output,\n\ - int transposeA,\n\ - int transposeB,\n\ - int adjointA,\n\ - int adjointB,\n\ - uint M, uint K, uint N)\n\ +__kernel void gemm_F32F32toF32(\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output,\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N)\n\ {\n\ uint gidx = get_global_id(0);\n\ uint gidy = get_global_id(1);\n\ @@ -20915,10 +22649,8 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA,\n\ int4 coord_a = (int4)(0, gidy, (ac2zero ? 
0 : get_global_id(2)), 0);\n\ int4 coord_b = (int4)(gidx, 0, (bc2zero ? 0 : get_global_id(2)), 0);\n\ \n\ - vxc_float4 sum0 = (vxc_float4)(0);\n\ - vxc_float4 sum1 = (vxc_float4)(0);\n\ - vxc_float4 sum2 = (vxc_float4)(0);\n\ - vxc_float4 sum3 = (vxc_float4)(0);\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);\n\ \n\ vxc_int4 tmpOut0, tmpOut1;\n\ vxc_uchar16 outC;\n\ @@ -20932,7 +22664,6 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA,\n\ \n\ coord_a.x = i;\n\ coord_a.y = gidy;\n\ -\n\ coord_b.x = gidx;\n\ coord_b.y = i;\n\ \n\ @@ -20965,7 +22696,8 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA,\n\ write_imagef(output, coord_b, sum2);\n\ coord_b.y++;\n\ write_imagef(output, coord_b, sum3);\n\ -}"; /* end of matrixmul_f16_vx*/ +}\n\ +"; /* end of matrixmul_f16_vx*/ static const char matrixmul_f16f16_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -23660,6 +25392,62 @@ __kernel void maximum_U8U8toU8_2D\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ +__kernel void maximum_U8U8toI16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar16 src0, src1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_short8 dst0, dst1, dst;\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + dst = max(dst0, dst1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_U8U8toI16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_uchar16 src0, src1;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_short8 dst0, dst1, dst;\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + dst = max(dst0, dst1);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ _viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8;\n\ _viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8;\n\ __kernel void maximum_I16I16toI16\n\ @@ -24198,6 +25986,66 @@ __kernel void maximum_F16F16toI16_2D\n\ tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);\n\ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, 
VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +__kernel void maximum_I16I16toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar16 dst0, dst1, dst;\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + dst = max(dst0, dst1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void maximum_I16I16toU8_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1;\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar16 dst0, dst1, dst;\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + dst = max(dst0, dst1);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }"; /* end of maximum_i16_vx*/ static const char minimum_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -24426,6 +26274,62 @@ __kernel void minimum_U8U8toU8_2D\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ +__kernel void minimum_U8U8toI16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_uchar16 src0, src1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_short8 dst0, dst1, dst;\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, 
VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + dst = min(dst0, dst1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_U8U8toI16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_uchar16 src0, src1;\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_short8 dst0, dst1, dst;\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + dst = min(dst0, dst1);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ _viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8;\n\ _viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8;\n\ __kernel void minimum_I16I16toI16\n\ @@ -24968,6 +26872,66 @@ __kernel void minimum_F16F16toI16_2D\n\ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ \n\ VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +__kernel void minimum_I16I16toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src0, src1;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar16 dst0, dst1, dst;\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + dst = min(dst0, dst1);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void minimum_I16I16toU8_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_short8 src0, src1;\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_uchar16 dst0, dst1, dst;\n\ + vxc_ushort8 mp0, mp1;\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift0_Lo_2x8);\n\ + VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniU8MulAndPostShift1_Lo_2x8);\n\ + dst = min(dst0, dst1);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }"; /* end of minimum_i16_vx*/ static const char moments_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -24989,6 +26953,9 @@ _viv_uniform float e2InScale;\n\ _viv_uniform float rowSumScale;\n\ _viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ #define MOMENTS_AXIS0_QINT(src0_type_name, read0_type) \\\n\ __kernel void moments_axis0_##src0_type_name##toF16( \\\n\ @@ -25236,6 +27203,88 @@ __kernel void moments_axis0_I16toF16_2D(\n\ \n\ VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void moments_axis0_BF16toBF16(\n\ + image2d_array_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidy = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(0, gidy, gidz, 0);\n\ + vxc_ushort8 src0, src1;\n\ + vxc_ushort8 val;\n\ + vxc_float4 mean_vari0 = (vxc_float4)(0);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f);\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 vec0, vec1;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, vec1, src1, 16);\n\ + mean_vari0.x += dot(vec0, one) + dot(vec1, one);\n\ + mean_vari0.y += dot(vec0, vec0) + dot(vec1, vec1);\n\ + }\n\ +\n\ + mean_vari0 *= dimRatio;\n\ + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0;\n\ +\n\ + int2 coord_out = (int2)(gidy, gidz);\n\ +\n\ + vxc_short8 dst;\n\ + _viv_asm(COPY, src0, mean_vari0, 16);\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void moments_axis0_BF16toBF16_2D(\n\ + image2d_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidy = get_global_id(0);\n\ + int2 coord = (int2)(0, gidy);\n\ + vxc_ushort8 src0, src1;\n\ + vxc_ushort8 val;\n\ + vxc_float4 mean_vari0 = (vxc_float4)(0);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 
0, 0);\n\ + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage(val, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 vec0, vec1;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, vec1, src1, 16);\n\ + mean_vari0.x += dot(vec0, one) + dot(vec1, one);\n\ + mean_vari0.y += dot(vec0, vec0) + dot(vec1, vec1);\n\ + }\n\ + mean_vari0 *= dimRatio;\n\ + mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0;\n\ +\n\ + int2 coord_out = (int2)(gidy, 0);\n\ +\n\ + vxc_short8 dst;\n\ + _viv_asm(COPY, src0, mean_vari0, 16);\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of moments_axis0_vx*/ static const char moments_axis01_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -25683,6 +27732,9 @@ _viv_uniform float e2InScale;\n\ _viv_uniform float rowSumScale;\n\ _viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ #define MOMENTS_AXIS012_QINT(src0_type_name, read0_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_##src0_type_name##toF16( \\\n\ @@ -25901,6 +27953,81 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_I1 VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_BF16toBF16(\n\ + image2d_array_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis,\n\ + int axis_num)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int4 coord = (int4)(gidx, 0, 0, 0);\n\ + vxc_float4 sumsqr;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ + float tmpSum = 0;\n\ + float tmpSqr = 0;\n\ + vxc_ushort8 src0, src1;\n\ + vxc_ushort8 val;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f);\n\ +\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + for(coord.x = gidx; coord.x < width; coord.x += 128)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + float4 vec0, vec1;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, vec1, src1, 16);\n\ + tmpSum += dot(vec0, one) + dot(vec1, one);\n\ + tmpSqr += dot(vec0, vec0) + dot(vec1, vec1);\n\ + }\n\ + 
}\n\ + }\n\ + lcl_sum[lidx] = tmpSum;\n\ + lcl_sqr[lidx] = tmpSqr;\n\ +\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = (float)(0);\n\ + float sqr = (float)(0);\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 mean_vari;\n\ + mean_vari.x = sum * dimRatio;\n\ + mean_vari.y = sqr * dimRatio;\n\ + mean_vari.y = mean_vari.y - mean_vari.x * mean_vari.x;\n\ +\n\ + vxc_short8 dst;\n\ + _viv_asm(COPY, src0, mean_vari, 16);\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ }"; /* end of moments_axis012_vx*/ static const char moments_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -25915,6 +28042,8 @@ _viv_uniform float e2InScale;\n\ _viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ _viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ #define MOMENTS_AXIS1_QINT(src0_type_name, read0_type) \\\n\ __kernel void moments_axis1_##src0_type_name##toF16( \\\n\ @@ -26102,6 +28231,88 @@ __kernel void moments_axis1_F16toF16_2D(\n\ VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ }\n\ +\n\ +__kernel void moments_axis1_BF16toBF16(\n\ + image2d_array_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_ushort8 src0;\n\ + vxc_ushort8 val;\n\ + vxc_float4 sum = (vxc_float4)(0);\n\ + vxc_float4 sqr = (vxc_float4)(0);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + float4 vec0;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ +\n\ + sum += vec0;\n\ + sqr += (vec0 * vec0);\n\ + }\n\ +\n\ + vxc_float4 mean = sum * dimRatio;\n\ + vxc_float4 vari = sqr * dimRatio;\n\ + vari = vari - mean * mean;\n\ +\n\ + int2 coord_out = (int2)(gidx, gidz);\n\ + vxc_short8 tmpdst0, tmpdst1, dst;\n\ + _viv_asm(COPY, tmpdst0, mean, 16);\n\ + _viv_asm(COPY, tmpdst1, vari, 16);\n\ + VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void moments_axis1_BF16toBF16_2D(\n\ + image2d_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, 0);\n\ 
+ vxc_ushort8 src0;\n\ + vxc_ushort8 val;\n\ + vxc_float4 sum = (vxc_float4)(0);\n\ + vxc_float4 sqr = (vxc_float4)(0);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage(val, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + float4 vec0;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ +\n\ + sum += vec0;\n\ + sqr += (vec0 * vec0);\n\ + }\n\ +\n\ + vxc_float4 mean = sum * dimRatio;\n\ + vxc_float4 vari = sqr * dimRatio;\n\ + vari = vari - mean * mean;\n\ +\n\ + int2 coord_out = (int2)(gidx, 0);\n\ + vxc_short8 tmpdst0, tmpdst1, dst;\n\ + _viv_asm(COPY, tmpdst0, mean, 16);\n\ + _viv_asm(COPY, tmpdst1, vari, 16);\n\ + VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ "; /* end of moments_axis1_vx*/ static const char moments_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -26115,6 +28326,8 @@ _viv_uniform float e2InScale;\n\ _viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ _viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ #define MOMENTS_AXIS2_QINT(src0_type_name, read0_type) \\\n\ __kernel void moments_axis2_##src0_type_name##toF16( \\\n\ @@ -26203,6 +28416,50 @@ __kernel void moments_axis2_F16toF16(\n\ _viv_asm(COPY, dst, tmpVal, 16);\n\ VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void moments_axis2_BF16toBF16(\n\ + image2d_array_t input,\n\ + image2d_t output_mean,\n\ + image2d_t output_vari,\n\ + int axis,\n\ + int axis_num)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int4 coord = (int4)(gidx, gidy, 0, 0);\n\ + vxc_ushort8 src0;\n\ + vxc_ushort8 val;\n\ + vxc_float4 sum = (vxc_float4)(0);\n\ + vxc_float4 sqr = (vxc_float4)(0);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + float4 vec0;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ +\n\ + sum += vec0;\n\ + sqr += (vec0 * vec0);\n\ + }\n\ +\n\ + vxc_float4 mean = sum * dimRatio;\n\ + vxc_float4 vari = sqr * dimRatio;\n\ + vari = vari - mean * mean;\n\ +\n\ + int2 coord_out = (int2)(gidx, gidy);\n\ +\n\ + vxc_short8 tmpdst0, tmpdst1, dst;\n\ + _viv_asm(COPY, tmpdst0, mean, 16);\n\ + _viv_asm(COPY, tmpdst1, vari, 16);\n\ + VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of 
moments_axis2_vx*/ static const char moments_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -26570,6 +28827,9 @@ _viv_uniform float rowSumScale;\n\ _viv_uniform float4 output_ZP;\n\ _viv_uniform float4 outputScale;\n\ _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ #define MOMENTS_AXIS012_QINT_U8(src0_type_name, read0_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_##src0_type_name##toU8( \\\n\ @@ -26627,7 +28887,144 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_## VXC_WriteImage(output_vari, coord_out, src0.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ } \\\n\ }\n\ -MOMENTS_AXIS012_QINT_U8(U8, vxc_uchar16)"; /* end of moments_u8_axis012_vx*/ +MOMENTS_AXIS012_QINT_U8(U8, vxc_uchar16)\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_BF16toBF16(\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ + float tmpSum = 0;\n\ + float tmpSqr = 0;\n\ + vxc_ushort8 src0, src1;\n\ + vxc_ushort8 val;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f);\n\ +\n\ + for(coord.x = gidx; coord.x < width; coord.x += 128)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + float4 vec0, vec1;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, vec1, src1, 16);\n\ + tmpSum += dot(vec0, one) + dot(vec1, one);\n\ + tmpSqr += dot(vec0, vec0) + dot(vec1, vec1);\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = tmpSum;\n\ + lcl_sqr[lidx] = tmpSqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(gidz, 0);\n\ + if(lidx == 0)\n\ + {\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0.0f;\n\ + float sqr = 0.0f;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 mean_vari;\n\ + mean_vari.x = sum * dimRatio;\n\ + mean_vari.y = sqr * dimRatio;\n\ + mean_vari.y = mean_vari.y - mean_vari.x * mean_vari.x;\n\ +\n\ + vxc_short8 dst;\n\ + _viv_asm(COPY, src0, mean_vari, 16);\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_BF16toBF16_2D(\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari,\n\ + int axis, int axis_num)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = 
get_local_id(0);\n\ + int2 coord = (int2)(gidx, 0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ + float tmpSum = 0;\n\ + float tmpSqr = 0;\n\ + vxc_ushort8 src0, src1;\n\ + vxc_ushort8 val;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f);\n\ +\n\ + for(coord.x = gidx; coord.x < width; coord.x += 128)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_ReadImage(val, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ +\n\ + float4 vec0, vec1;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, vec1, src1, 16);\n\ + tmpSum += dot(vec0, one) + dot(vec1, one);\n\ + tmpSqr += dot(vec0, vec0) + dot(vec1, vec1);\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = tmpSum;\n\ + lcl_sqr[lidx] = tmpSqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0.0f;\n\ + float sqr = 0.0f;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 mean_vari;\n\ + mean_vari.x = sum * dimRatio;\n\ + mean_vari.y = sqr * dimRatio;\n\ + mean_vari.y = mean_vari.y - mean_vari.x * mean_vari.x;\n\ +\n\ + vxc_short8 dst;\n\ + _viv_asm(COPY, src0, mean_vari, 16);\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of moments_u8_axis012_vx*/ static const char one_hot_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -26834,7 +29231,94 @@ __kernel void one_hot_##name0##to##name1##_2D \\\n\ ONE_HOT_ASYM_SH_IMPL_2D(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8)\n\ ONE_HOT_ASYM_SH_IMPL_2D(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ \n\ -"; /* end of one_hot_vx*/ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +__kernel void one_hot_BF16toBF16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int suffix_sz,\n\ + int on_val,\n\ + int off_val\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(0), get_global_id(0));\n\ +\n\ + vxc_ushort8 src0, src1;\n\ + vxc_ushort8 val;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 vec0, vec1;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ + VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, vec1, src1, 16);\n\ + int4 data0 = convert_int4(vec0);\n\ + int4 data1 = convert_int4(vec1);\n\ +\n\ + do\n\ + {\n\ + int4 d0 = data0 == coord.zzzz ? on_val : off_val;\n\ + int4 d1 = data1 == coord.zzzz ? 
on_val : off_val;\n\ +\n\ + vxc_short8 dst;\n\ + VXC_DP2x8(dst, d0, d1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord.xzyw, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord.z ++;\n\ + } while (coord.z < depth);\n\ +}\n\ +\n\ +__kernel void one_hot_BF16toBF16_2D\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int suffix_sz,\n\ + int on_val,\n\ + int off_val\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_ushort8 src0, src1;\n\ + vxc_ushort8 val;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 vec0;\n\ + VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ + int4 data = convert_int4(vec0);\n\ + int4 data0, data1;\n\ + int4 d4 = (int4)(0, 1, 2, 3);\n\ + do\n\ + {\n\ + coord.zw = coord.xx + (int2)(0, 1);\n\ + vxc_short8 dst;\n\ + data0 = data.xxxx == d4 ? on_val : off_val;\n\ + data1 = data.yyyy == d4 ? on_val : off_val;\n\ +\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ +\n\ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.zw = coord.zw + (int2)(2, 2);\n\ +\n\ + data0 = data.zzzz == d4 ? on_val : off_val;\n\ + data1 = data.wwww == d4 ? on_val : off_val;\n\ +\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ +\n\ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0));\n\ + d4 += 4;\n\ + coord.y += 4;\n\ + } while (coord.y < depth);\n\ +}"; /* end of one_hot_vx*/ static const char poolwithargmax_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -35039,13 +37523,15 @@ _viv_uniform float input1Tail;\n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;\n\ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ \n\ #define COMPARISONS_2D(func_name, src0_type_name, src1_type_name, \\\n\ src0_type, src0_copy_type, src1_type, src1_copy_type, cmp_op) \\\n\ __kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_2D( \\\n\ - __read_only image2d_array_t input0, \\\n\ - __read_only image2d_array_t input1, \\\n\ - __write_only image2d_array_t output \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -35144,6 +37630,45 @@ COMPARISONS_2D(not_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half COMPARISONS_2D(not_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, !=)\n\ COMPARISONS_2D(not_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, !=)\n\ \n\ +#define COMPARISONS_BF_2D(func_name, src0_type_name, src1_type_name, cmp_op) \\\n\ +__kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_2D( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output 
\\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + vxc_ushort8 src0, src1, srcA, srcB; \\\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_ReadImage(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 vecA0, vecA1; \\\n\ + float4 vecB0, vecB1; \\\n\ + VXC_DP2x8(src0, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecA0, src0, 16); \\\n\ + VXC_DP2x8(src1, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, vecA1, src1, 16); \\\n\ + VXC_DP2x8(src0, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecB0, src0, 16); \\\n\ + VXC_DP2x8(src1, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, vecB1, src1, 16); \\\n\ + int4 dst0, dst1; \\\n\ + dst0 = (vecA0)cmp_op(vecB0); \\\n\ + dst1 = (vecA1)cmp_op(vecB1); \\\n\ + \\\n\ + vxc_char16 dst; \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + dst &= 1; \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +COMPARISONS_BF_2D(less, BF16, BF16, <)\n\ +COMPARISONS_BF_2D(great, BF16, BF16, >)\n\ +COMPARISONS_BF_2D(less_equal, BF16, BF16, <=)\n\ +COMPARISONS_BF_2D(great_equal, BF16, BF16, >=)\n\ +COMPARISONS_BF_2D(equal, BF16, BF16, ==)\n\ +COMPARISONS_BF_2D(not_equal, BF16, BF16, !=)\n\ "; /* end of relational_ops_2d_vx*/ static const char relational_ops_3d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -35155,6 +37680,8 @@ _viv_uniform float input1Tail;\n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;\n\ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ \n\ #define COMPARISONS_3D(func_name, src0_type_name, src1_type_name, \\\n\ src0_type, src0_copy_type, src1_type, src1_copy_type, cmp_op) \\\n\ @@ -35260,6 +37787,45 @@ COMPARISONS_3D(not_equal, U8, F16, vxc_uchar8, vxc_uchar8, vxc_short8, vxc_half COMPARISONS_3D(not_equal, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, !=)\n\ COMPARISONS_3D(not_equal, I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, !=)\n\ \n\ +#define COMPARISONS_BF_3D(func_name, src0_type_name, src1_type_name, cmp_op) \\\n\ +__kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_3D( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + vxc_ushort8 src0, src1, srcA, srcB; \\\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_ReadImage2DArray(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 vecA0, vecA1; \\\n\ + float4 vecB0, vecB1; \\\n\ + VXC_DP2x8(src0, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecA0, src0, 16); \\\n\ + VXC_DP2x8(src1, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 
0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, vecA1, src1, 16); \\\n\ + VXC_DP2x8(src0, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecB0, src0, 16); \\\n\ + VXC_DP2x8(src1, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, vecB1, src1, 16); \\\n\ + int4 dst0, dst1; \\\n\ + dst0 = (vecA0)cmp_op(vecB0); \\\n\ + dst1 = (vecA1)cmp_op(vecB1); \\\n\ + \\\n\ + vxc_char16 dst; \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + dst &= 1; \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +COMPARISONS_BF_3D(less, BF16, BF16, <)\n\ +COMPARISONS_BF_3D(great, BF16, BF16, >)\n\ +COMPARISONS_BF_3D(less_equal, BF16, BF16, <=)\n\ +COMPARISONS_BF_3D(great_equal, BF16, BF16, >=)\n\ +COMPARISONS_BF_3D(equal, BF16, BF16, ==)\n\ +COMPARISONS_BF_3D(not_equal, BF16, BF16, !=)\n\ "; /* end of relational_ops_3d_vx*/ static const char relu_keras_vx[] = "\n\ @@ -38763,7 +41329,7 @@ __kernel void resize_bilinear_U8toU8_DOWN\n\ }\n\ "; /* end of resize_bilinear_U8_vx*/ -static const char resize_bilinear_U8_half_pixel_centers_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char resize_bilinear_U8_half_pixel_centers_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniResize2xUp_0_4x8;\n\ _viv_uniform VXC_512Bits uniResize2xUp_1_4x8;\n\ @@ -38992,7 +41558,138 @@ __kernel void resize_bilinear_U8toU8_SAME_3x_upsample_half_pixel_centers\n\ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ }\n\ -"; /* end of resize_bilinear_U8_half_pixel_centers_vx*/ +"; /* end of resize_bilinear_U8_half_pixel_centers_1_vx*/ + +static const char resize_bilinear_U8_half_pixel_centers_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int out_height;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l00_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l01_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l10_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l11_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l20_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l21_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l30_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l31_4x8;\n\ +__kernel void resize_bilinear_U8toU8_SAME_8x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);\n\ + coord_in.x = (coord_out.x * 2 - 7) >> 4;\n\ + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x;\n\ +\n\ + vxc_uchar16 in0, in1, in2, tmp, dst0, dst1, dst2, dst3;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + while (coord_out.y < out_height)\n\ + {\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);\n\ + VXC_DP4x8(dst2, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);\n\ + VXC_DP4x8(dst2, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);\n\ + VXC_DP4x8(dst3, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);\n\ + VXC_DP4x8(dst3, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);\n\ + VXC_DP4x8(dst2, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);\n\ + VXC_DP4x8(dst2, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);\n\ + VXC_DP4x8(dst3, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);\n\ + VXC_DP4x8(dst3, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + 
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);\n\ + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);\n\ + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);\n\ + VXC_DP4x8(dst2, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);\n\ + VXC_DP4x8(dst2, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);\n\ + VXC_DP4x8(dst3, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);\n\ + VXC_DP4x8(dst3, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y += 2;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);\n\ + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);\n\ + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);\n\ + VXC_DP4x8(dst2, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);\n\ + VXC_DP4x8(dst2, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);\n\ + VXC_DP4x8(dst3, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);\n\ + VXC_DP4x8(dst3, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + }\n\ +}\n\ +"; /* end of resize_bilinear_U8_half_pixel_centers_2_vx*/ static const char resize_bilinear_U8_opt_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -39651,10 +42348,14 @@ _viv_uniform int offsetX;\n\ _viv_uniform int offsetY;\n\ _viv_uniform int offsetZ;\n\ \n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ __kernel void scatter_nd_F16toF16(\n\ __read_only image2d_t input0,\n\ __read_only 
image2d_t input1,\n\ - image2d_array_t output,\n\ + image2d_t output,\n\ int width,\n\ int area,\n\ int coord_dim\n\ @@ -39682,11 +42383,53 @@ __kernel void scatter_nd_F16toF16(\n\ VXC_WriteImage(output, (int2)(gidx, gidy), tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ +__kernel void scatter_nd_BF16toBF16(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + image2d_t output,\n\ + int width,\n\ + int area,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ +\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 sum0 = (float4)(0);\n\ + float4 sum1 = (float4)(0);\n\ + vxc_ushort8 tmpVal;\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + int4 indice = read_imagei(input0, (int2)(0, i));\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ;\n\ + if(gidy == idx)\n\ + {\n\ + vxc_ushort8 src0, src1;\n\ + VXC_ReadImage(tmpVal, input1, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 vec0, vec1;\n\ + VXC_DP2x8(src0, tmpVal, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, vec0, src0, 16);\n\ + VXC_DP2x8(src1, tmpVal, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, vec1, src1, 16);\n\ + sum0 += vec0;\n\ + sum1 += vec1;\n\ + }\n\ + }\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + _viv_asm(COPY, dst0, sum0, 16);\n\ + _viv_asm(COPY, dst1, sum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, (int2)(gidx, gidy), dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ #define SCATTER_ND_QINT(src0_type_name, data_type) \\\n\ __kernel void scatter_nd_##src0_type_name##to##src0_type_name##( \\\n\ __read_only image2d_t input0, \\\n\ __read_only image2d_t input1, \\\n\ - image2d_array_t output, \\\n\ + image2d_t output, \\\n\ int width, \\\n\ int area, \\\n\ int coord_dim \\\n\ @@ -42914,7 +45657,7 @@ static const char vsi_nn_kernel_header_vx[] = "/*\n\ Description :\n\ ============================================================================\n\ */\n\ -#include \"cl_viv_vx_ext.h\"\n\ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ \n\ typedef struct Image\n\ {\n\ @@ -44567,6 +47310,45 @@ CAST_TO_BOOL_FUN_2D(U32, uint4, read_imageui)\n\ \n\ "; /* end of cast_cl*/ +static const char clip_BF16_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ +\n\ +__kernel void clip_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + uint4 src0 = read_imageui(input, coord);\n\ + src0 = src0 << 16;\n\ + float4 src;\n\ + _viv_asm(COPY, src, src0, 16);\n\ + float4 dst0 = clamp(src, minData, maxData);\n\ + uint4 dst;\n\ + _viv_asm(COPY, dst, dst0, 16);\n\ + dst = dst >> 16;\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void clip_BF16toBF16_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float minData,\n\ + float maxData)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + uint4 src0 = read_imageui(input, coord);\n\ + src0 = src0 << 16;\n\ + float4 src;\n\ + _viv_asm(COPY, src, src0, 16);\n\ + float4 dst0 = clamp(src, minData, maxData);\n\ + uint4 dst;\n\ + _viv_asm(COPY, dst, dst0, 16);\n\ + dst = dst >> 
16;\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +"; /* end of clip_BF16_cl*/ + static const char clip_F32_cl[] = "__kernel void clip_F32toF32(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ @@ -44708,6 +47490,25 @@ __kernel void clip_U8toF32_2D(\n\ }\n\ "; /* end of clip_U8_cl*/ +static const char depth2space_crd_cl[] = "\n\ +__kernel void depth2space_crd_F32toF32(\n\ + image2d_array_t input, image2d_array_t output, int block_size)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0);\n\ + int block_e2 = block_size * block_size;\n\ + ushort blk = (ushort)block_size;\n\ + int inx = (int)((ushort)gidx / blk);\n\ + int iny = (int)((ushort)gidy / blk);\n\ + int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2;\n\ + int4 coord_in = (int4)(inx, iny, inz, 0);\n\ + float4 data = read_imagef(input, coord_in);\n\ + write_imagef(output, coord_out, data);\n\ +}\n\ +"; /* end of depth2space_crd_cl*/ + static const char detect_post_box_cl[] = "float exp_(float x, float logE)\n\ {\n\ x *= logE;\n\ @@ -44919,6 +47720,11 @@ static const char eltwise_unary_cl[] = "float eltwise_unary_sin(float x, float a return native_sin(x);\n\ }\n\ \n\ +float eltwise_unary_cos(float x, float alpha, float beta)\n\ +{\n\ + return native_cos(x);\n\ +}\n\ +\n\ #define logE (1.44269502f)\n\ #define twoLogE (logE * 2.0f)\n\ float eltwise_unary_exp(float x, float alpha, float beta)\n\ @@ -45051,6 +47857,7 @@ __kernel void func_name##_F32toF32 \\\n\ write_imagef(output, coord, dst.xxxx); \\\n\ }\n\ ELTWISE_UNARY_F32(sin)\n\ +ELTWISE_UNARY_F32(cos)\n\ ELTWISE_UNARY_F32(exp)\n\ ELTWISE_UNARY_F32(log)\n\ ELTWISE_UNARY_F32(elu)\n\ @@ -45084,6 +47891,7 @@ __kernel void func_name##_F32toF32_2D \\\n\ write_imagef(output, coord, dst.xxxx); \\\n\ }\n\ ELTWISE_UNARY_F32_2D(sin)\n\ +ELTWISE_UNARY_F32_2D(cos)\n\ ELTWISE_UNARY_F32_2D(exp)\n\ ELTWISE_UNARY_F32_2D(log)\n\ ELTWISE_UNARY_F32_2D(elu)\n\ @@ -45118,6 +47926,7 @@ __kernel void func_name##_U8toU8 \\\n\ write_imageui(output, coord, dst); \\\n\ }\n\ ELTWISE_UNARY_U8(sin)\n\ +ELTWISE_UNARY_U8(cos)\n\ ELTWISE_UNARY_U8(exp)\n\ ELTWISE_UNARY_U8(log)\n\ ELTWISE_UNARY_U8(elu)\n\ @@ -45152,6 +47961,7 @@ __kernel void func_name##_U8toU8_2D \\\n\ write_imageui(output, coord, dst); \\\n\ }\n\ ELTWISE_UNARY_U8_2D(sin)\n\ +ELTWISE_UNARY_U8_2D(cos)\n\ ELTWISE_UNARY_U8_2D(exp)\n\ ELTWISE_UNARY_U8_2D(log)\n\ ELTWISE_UNARY_U8_2D(elu)\n\ @@ -45319,10 +48129,18 @@ __kernel void func_name##_U8toU8_2D \\\n\ ELTWISE_UNARY_U8_2D(erf)\n\ "; /* end of erf_cl*/ -static const char floordiv_cl[] = "__kernel void floordiv_F32F32toF32(\n\ +static const char floordiv_cl[] = "__kernel void floordiv_F32F32toF32\n\ + (\n\ __read_only image2d_array_t input,\n\ __read_only image2d_array_t input1,\n\ - __write_only image2d_array_t output)\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ float4 src0;\n\ @@ -45333,10 +48151,18 @@ static const char floordiv_cl[] = "__kernel void floordiv_F32F32toF32(\n\ write_imagef(output, coord, dst);\n\ }\n\ \n\ -__kernel void floordiv_F32F32toF32_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output)\n\ +__kernel void 
floordiv_F32F32toF32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ float4 src0 = read_imagef(input, coord);\n\ @@ -45345,33 +48171,8 @@ __kernel void floordiv_F32F32toF32_2D(\n\ write_imagef(output, coord, dst);\n\ }\n\ \n\ -__kernel void floordiv_I32I32toI32(\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_array_t input1,\n\ - __write_only image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 src0;\n\ - int4 src1;\n\ - READ_IMAGEI_2DARRAY(src0, input, coord);\n\ - READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ - int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1)));\n\ - write_imagei(output, coord, dst);\n\ -}\n\ -\n\ -__kernel void floordiv_I32I32toI32_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ - int4 src0 = read_imagei(input, coord);\n\ - int4 src1 = read_imagei(input1, coord);\n\ - int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1)));\n\ - write_imagei(output, coord, dst);\n\ -}\n\ -\n\ -__kernel void floordiv_I32I32toU8(\n\ +__kernel void floordiv_I32I32toI32\n\ + (\n\ __read_only image2d_array_t input,\n\ __read_only image2d_array_t input1,\n\ __write_only image2d_array_t output,\n\ @@ -45380,7 +48181,56 @@ __kernel void floordiv_I32I32toU8(\n\ float input1Scale,\n\ float input1Tail,\n\ float outputScale,\n\ - float outputTail )\n\ + float outputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 src0;\n\ + int4 src1;\n\ + READ_IMAGEI_2DARRAY(src0, input, coord);\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ + float4 in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + float4 in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + float4 out = floor(in0 / in1) * outputScale + outputTail;\n\ + int4 dst = convert_int4(out);\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void floordiv_I32I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 src0 = read_imagei(input, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ + float4 in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + float4 in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + float4 out = floor(in0 / in1) * outputScale + outputTail;\n\ + int4 dst = convert_int4(out);\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void floordiv_I32I32toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ int4 src0;\n\ @@ -45394,16 +48244,18 @@ __kernel void floordiv_I32I32toU8(\n\ 
write_imageui(output, coord, dst);\n\ }\n\ \n\ -__kernel void floordiv_I32I32toU8_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ - float input0Scale,\n\ - float input0Tail,\n\ - float input1Scale,\n\ - float input1Tail,\n\ - float outputScale,\n\ - float outputTail )\n\ +__kernel void floordiv_I32I32toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ int4 src0 = read_imagei(input, coord);\n\ @@ -45415,7 +48267,8 @@ __kernel void floordiv_I32I32toU8_2D(\n\ write_imageui(output, coord, dst);\n\ }\n\ \n\ -__kernel void floordiv_U8U8toU8(\n\ +__kernel void floordiv_U8U8toU8\n\ + (\n\ __read_only image2d_array_t input,\n\ __read_only image2d_array_t input1,\n\ __write_only image2d_array_t output,\n\ @@ -45424,7 +48277,8 @@ __kernel void floordiv_U8U8toU8(\n\ float input1Scale,\n\ float input1Tail,\n\ float outputScale,\n\ - float outputTail )\n\ + float outputTail\n\ + )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ uint4 src0, src1;\n\ @@ -45438,16 +48292,18 @@ __kernel void floordiv_U8U8toU8(\n\ write_imageui(output, coord, dst);\n\ }\n\ \n\ -__kernel void floordiv_U8U8toU8_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ - float input0Scale,\n\ - float input0Tail,\n\ - float input1Scale,\n\ - float input1Tail,\n\ - float outputScale,\n\ - float outputTail )\n\ +__kernel void floordiv_U8U8toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ uint4 src0 = read_imageui(input, coord);\n\ @@ -45460,7 +48316,8 @@ __kernel void floordiv_U8U8toU8_2D(\n\ write_imageui(output, coord, dst);\n\ }\n\ \n\ -__kernel void floordiv_U8I32toU8(\n\ +__kernel void floordiv_U8I32toU8\n\ + (\n\ __read_only image2d_array_t input,\n\ __read_only image2d_array_t input1,\n\ __write_only image2d_array_t output,\n\ @@ -45469,7 +48326,8 @@ __kernel void floordiv_U8I32toU8(\n\ float input1Scale,\n\ float input1Tail,\n\ float outputScale,\n\ - float outputTail )\n\ + float outputTail\n\ + )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ uint4 src0;\n\ @@ -45484,16 +48342,18 @@ __kernel void floordiv_U8I32toU8(\n\ write_imageui(output, coord, dst);\n\ }\n\ \n\ -__kernel void floordiv_U8I32toU8_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ - float input0Scale,\n\ - float input0Tail,\n\ - float input1Scale,\n\ - float input1Tail,\n\ - float outputScale,\n\ - float outputTail )\n\ +__kernel void floordiv_U8I32toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ uint4 src0 = read_imageui(input, coord);\n\ @@ -45514,7 
+48374,8 @@ static const char gather_cl[] = "__kernel void gather_U8toU8(\n\ int block_size,\n\ int block_num,\n\ int axis_num,\n\ - int indices_num\n\ + int indices_num,\n\ + int batch\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -45538,7 +48399,8 @@ __kernel void gather_F16toF16(\n\ int block_size,\n\ int block_num,\n\ int axis_num,\n\ - int indices_num\n\ + int indices_num,\n\ + int batch\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -45562,7 +48424,8 @@ __kernel void gather_I32toI32(\n\ int block_size,\n\ int block_num,\n\ int axis_num,\n\ - int indices_num\n\ + int indices_num,\n\ + int batch\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -45586,7 +48449,8 @@ __kernel void gather_F32toF32(\n\ int block_size,\n\ int block_num,\n\ int axis_num,\n\ - int indices_num\n\ + int indices_num,\n\ + int batch\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -45604,6 +48468,131 @@ __kernel void gather_F32toF32(\n\ }\n\ "; /* end of gather_cl*/ +static const char gather_batch_cl[] = "__kernel void gather_batch_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num,\n\ + int indices_num,\n\ + int batch\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + uint4 data = read_imageui(input0, coord_in);\n\ + coord_in.z++;\n\ + write_imageui(output, coord, data);\n\ + coord.z++;\n\ + }\n\ +}\n\ +\n\ +__kernel void gather_batch_F16toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num,\n\ + int indices_num,\n\ + int batch\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + float4 data = read_imagef(input0, coord_in);\n\ + coord_in.z++;\n\ + write_imagef(output, coord, data);\n\ + coord.z++;\n\ + }\n\ +}\n\ +\n\ +__kernel void gather_batch_I32toI32(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num,\n\ + int indices_num,\n\ + int batch\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + int4 data = read_imagei(input0, 
coord_in);\n\ + coord_in.z++;\n\ + write_imagei(output, coord, data);\n\ + coord.z++;\n\ + }\n\ +}\n\ +\n\ +__kernel void gather_batch_F32toF32(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num,\n\ + int indices_num,\n\ + int batch\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int2 coord_idx = (int2)(gidy, 0);\n\ + int4 coord_in = (int4)(gidx, 0, 0, 0);\n\ + int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);\n\ + for(; coord_idx.y < batch;)\n\ + {\n\ + int4 indice = read_imagei(input1, coord_idx);\n\ + coord_idx.y++;\n\ + coord_in.y = gidz * axis_num + indice.x;\n\ +\n\ + float4 data = read_imagef(input0, coord_in);\n\ + coord_in.z++;\n\ + write_imagef(output, coord, data);\n\ + coord.z++;\n\ + }\n\ +}\n\ +"; /* end of gather_batch_cl*/ + static const char gather_nd_cl[] = "__kernel void gather_nd_U8toU8_1D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ @@ -52908,6 +55897,48 @@ __kernel void moments_axis0_I32toF32(\n\ int2 coord_out = (int2)(gidy, gidz);\n\ write_imagef(output_mean, coord_out, mean);\n\ write_imagef(output_vari, coord_out, vari);\n\ +}\n\ +\n\ +__kernel void moments_axis0_BF16toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output_mean,\n\ + __write_only image2d_t output_vari,\n\ + int axis,\n\ + int axis_num,\n\ + int input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + float dimRatio\n\ + )\n\ +{\n\ + int gidy = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ +\n\ + int4 coord0 = (int4)(0, gidy, gidz, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + for(coord0.x = 0; coord0.x < width;)\n\ + {\n\ + uint4 src0 = read_imageui(input, coord0);\n\ + src0 = src0 << 16;\n\ + _viv_asm(COPY, data, src0, 16);\n\ + coord0.x++;\n\ +\n\ + sum = sum + data.x;\n\ + sqr = sqr + data.x * data.x;\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + int2 coord_out = (int2)(gidy, gidz);\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ }"; /* end of moments_axis0_cl*/ static const char moments_axis01_cl[] = "__kernel void moments_axis01_U8toF32(\n\ @@ -53084,6 +56115,66 @@ __kernel void moments_axis01_I32toF32(\n\ write_imagef(output_vari, coord_out, vari);\n\ }\n\ }\n\ +\n\ +__kernel void moments_axis01_BF16toF32(\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari,\n\ + int axis, int axis_num, int input_zp, float input_scale,\n\ + int width, int height, int chn, float dimRatio\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + for(coord.x = gidx; coord.x < width; coord.x += 16)\n\ + {\n\ + float tmpSum = 0, tmpSqr = 0;\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + uint4 src0 = read_imageui(input, coord);\n\ + src0 = src0 << 16;\n\ + _viv_asm(COPY, data, src0, 16);\n\ + coord.y++;\n\ +\n\ + tmpSum = tmpSum + data.x;\n\ + tmpSqr = tmpSqr + data.x * data.x;\n\ + }\n\ + sqr += tmpSqr;\n\ + sum += tmpSum;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ 
+ lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(gidz, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ + }\n\ +}\n\ "; /* end of moments_axis01_cl*/ static const char moments_axis012_cl[] = "__kernel void moments_axis012_U8toF32(\n\ @@ -53265,6 +56356,67 @@ __kernel void moments_axis012_I32toF32(\n\ write_imagef(output_vari, coord_out, vari);\n\ }\n\ }\n\ +\n\ +__kernel void moments_axis012_BF16toF32(\n\ + image2d_array_t input, image2d_t output_mean, image2d_t output_vari,\n\ + int axis, int axis_num, int input_zp, float input_scale,\n\ + int width, int height, int chn, float dimRatio\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, 0, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + for(coord.z = 0; coord.z < chn; coord.z++)\n\ + {\n\ + for(coord.x = gidx; coord.x < width; coord.x += 16)\n\ + {\n\ + float tmpSum = 0, tmpSqr = 0;\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + uint4 src0 = read_imageui(input, coord);\n\ + src0 = src0 << 16;\n\ + _viv_asm(COPY, data, src0, 16);\n\ + coord.y++;\n\ + tmpSum = tmpSum + data.x;\n\ + tmpSqr = tmpSqr + data.x * data.x;\n\ + }\n\ + sqr += tmpSqr;\n\ + sum += tmpSum;\n\ + }\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ + }\n\ +}\n\ "; /* end of moments_axis012_cl*/ static const char moments_axis1_cl[] = "__kernel void moments_axis1_U8toF32(\n\ @@ -53378,6 +56530,47 @@ __kernel void moments_axis1_I32toF32(\n\ int2 coord_out = (int2)(gidx, gidz);\n\ write_imagef(output_mean, coord_out, mean);\n\ write_imagef(output_vari, coord_out, vari);\n\ +}\n\ +\n\ +__kernel void moments_axis1_BF16toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output_mean,\n\ + __write_only image2d_t output_vari,\n\ + int axis,\n\ + int axis_num,\n\ + int input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + float dimRatio\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ +\n\ + int4 coord0 = (int4)(gidx, 0, gidz, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + for(coord0.y = 0; coord0.y < height;)\n\ + {\n\ + uint4 src0 = read_imageui(input, coord0);\n\ + src0 = src0 << 16;\n\ + _viv_asm(COPY, data, src0, 16);\n\ + coord0.y++;\n\ + sum = sum + 
data.x;\n\ + sqr = sqr + data.x * data.x;\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + int2 coord_out = (int2)(gidx, gidz);\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ }"; /* end of moments_axis1_cl*/ static const char moments_axis2_cl[] = "__kernel void moments_axis2_U8toF32(\n\ @@ -53505,7 +56698,50 @@ __kernel void moments_axis2_I32toF32(\n\ int2 coord_out = (int2)(gidx, gidy);\n\ write_imagef(output_mean, coord_out, mean);\n\ write_imagef(output_vari, coord_out, vari);\n\ -}"; /* end of moments_axis2_cl*/ +}\n\ +\n\ +__kernel void moments_axis2_BF16toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output_mean,\n\ + __write_only image2d_t output_vari,\n\ + int axis,\n\ + int axis_num,\n\ + int input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + float dimRatio\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ +\n\ + int4 coord0 = (int4)(gidx, gidy, 0, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + for(coord0.z = 0; coord0.z < chn;)\n\ + {\n\ + uint4 src0 = read_imageui(input, coord0);\n\ + src0 = src0 << 16;\n\ + _viv_asm(COPY, data, src0, 16);\n\ + coord0.z++;\n\ +\n\ + sum = sum + data.x;\n\ + sqr = sqr + data.x * data.x;\n\ + }\n\ +\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + int2 coord_out = (int2)(gidx, gidy);\n\ + write_imagef(output_mean, coord_out, mean);\n\ + write_imagef(output_vari, coord_out, vari);\n\ +}\n\ +"; /* end of moments_axis2_cl*/ static const char one_hot_cl[] = "__kernel void one_hot_F32toF32\n\ (\n\ @@ -57468,6 +60704,259 @@ TILE_2D(F32, F32, float4, read_imagef, write_imagef)\n\ \n\ "; /* end of tile_cl*/ +static const char topk_cl[] = "#define TOPK_F32(LOCAL_SIZE0, STAGES) \\\n\ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toF32_I32 \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t indices, \\\n\ + int num_stages, \\\n\ + int width \\\n\ + ) \\\n\ + { \\\n\ + uint local_id = get_local_id(0); \\\n\ + uint work_group_size = get_local_size(0); \\\n\ + uint offset = 0; \\\n\ + \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + __local float local_data[128]; \\\n\ + __local uint local_indices[128]; \\\n\ + \\\n\ + float left = read_imagef(input, coord.xy).x; \\\n\ + coord.z += work_group_size; \\\n\ + float data = read_imagef(input, coord.zy).x; \\\n\ + float right = coord.z < width ? 
data : -2147483647.0f; \\\n\ + \\\n\ + local_data[local_id] = left; \\\n\ + local_indices[local_id] = local_id; \\\n\ + local_data[local_id + work_group_size] = right; \\\n\ + local_indices[local_id + work_group_size] = local_id + work_group_size; \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\ + { \\\n\ + uint signo = (local_id >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + uint left_idx = local_indices[left_id]; \\\n\ + uint right_idx = local_indices[right_id]; \\\n\ + \\\n\ + float left_elem = local_data[left_id]; \\\n\ + float right_elem = local_data[right_id]; \\\n\ + \\\n\ + if ((left_elem < right_elem) ^ signo) \\\n\ + { \\\n\ + local_data[left_id] = right_elem; \\\n\ + local_data[right_id] = left_elem; \\\n\ + \\\n\ + local_indices[left_id] = right_idx; \\\n\ + local_indices[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + float4 dst; \\\n\ + dst.x = local_data[local_id]; \\\n\ + dst.y = local_data[local_id + work_group_size]; \\\n\ + \\\n\ + write_imagef(output, coord.xy, dst.xxxx); \\\n\ + write_imagef(output, coord.zy, dst.yyyy); \\\n\ + \\\n\ + int4 index; \\\n\ + index.x = ((int*)local_indices)[local_id]; \\\n\ + index.y = ((int*)local_indices)[local_id + work_group_size]; \\\n\ + \\\n\ + write_imagei(indices, coord.xy, index.xxxx); \\\n\ + write_imagei(indices, coord.zy, index.yyyy); \\\n\ + }\n\ +TOPK_F32(1 << 0, 0)\n\ +TOPK_F32(1 << 1, 1)\n\ +TOPK_F32(1 << 2, 2)\n\ +TOPK_F32(1 << 3, 3)\n\ +TOPK_F32(1 << 4, 4)\n\ +TOPK_F32(1 << 5, 5)\n\ +TOPK_F32(1 << 6, 6)\n\ +\n\ +#define TOPK_U32(LOCAL_SIZE0, STAGES) \\\n\ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_U32toU32_I32 \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t indices, \\\n\ + int num_stages, \\\n\ + int width \\\n\ + ) \\\n\ + { \\\n\ + uint local_id = get_local_id(0); \\\n\ + uint work_group_size = get_local_size(0); \\\n\ + uint offset = 0; \\\n\ + \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + __local uint local_data[128]; \\\n\ + __local uint local_indices[128]; \\\n\ + \\\n\ + uint left = read_imageui(input, coord.xy).x; \\\n\ + coord.z += work_group_size; \\\n\ + uint data = read_imageui(input, coord.zy).x; \\\n\ + uint right = coord.z < width ? 
data : 0; \\\n\ + \\\n\ + local_data[local_id] = left; \\\n\ + local_indices[local_id] = local_id; \\\n\ + local_data[local_id + work_group_size] = right; \\\n\ + local_indices[local_id + work_group_size] = local_id + work_group_size; \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\ + { \\\n\ + uint signo = (local_id >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + uint left_idx = local_indices[left_id]; \\\n\ + uint right_idx = local_indices[right_id]; \\\n\ + \\\n\ + uint left_elem = local_data[left_id]; \\\n\ + uint right_elem = local_data[right_id]; \\\n\ + \\\n\ + if ((left_elem < right_elem) ^ signo) \\\n\ + { \\\n\ + local_data[left_id] = right_elem; \\\n\ + local_data[right_id] = left_elem; \\\n\ + \\\n\ + local_indices[left_id] = right_idx; \\\n\ + local_indices[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + uint4 dst; \\\n\ + dst.x = local_data[local_id]; \\\n\ + dst.y = local_data[local_id + work_group_size]; \\\n\ + \\\n\ + write_imageui(output, coord.xy, dst.xxxx); \\\n\ + write_imageui(output, coord.zy, dst.yyyy); \\\n\ + \\\n\ + int4 index; \\\n\ + index.x = ((int*)local_indices)[local_id]; \\\n\ + index.y = ((int*)local_indices)[local_id + work_group_size]; \\\n\ + \\\n\ + write_imagei(indices, coord.xy, index.xxxx); \\\n\ + write_imagei(indices, coord.zy, index.yyyy); \\\n\ + }\n\ +TOPK_U32(1 << 0, 0)\n\ +TOPK_U32(1 << 1, 1)\n\ +TOPK_U32(1 << 2, 2)\n\ +TOPK_U32(1 << 3, 3)\n\ +TOPK_U32(1 << 4, 4)\n\ +TOPK_U32(1 << 5, 5)\n\ +TOPK_U32(1 << 6, 6)\n\ +\n\ +#define TOPK_I32(LOCAL_SIZE0, STAGES) \\\n\ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_I32toI32_I32 \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t indices, \\\n\ + int num_stages, \\\n\ + int width \\\n\ + ) \\\n\ + { \\\n\ + int local_id = get_local_id(0); \\\n\ + int work_group_size = get_local_size(0); \\\n\ + int offset = 0; \\\n\ + \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + __local int local_data[128]; \\\n\ + __local int local_indices[128]; \\\n\ + \\\n\ + int left = read_imagei(input, coord.xy).x; \\\n\ + coord.z += work_group_size; \\\n\ + int data = read_imagei(input, coord.zy).x; \\\n\ + int right = coord.z < width ? 
data : -2147483647; \\\n\ + \\\n\ + local_data[local_id] = left; \\\n\ + local_indices[local_id] = local_id; \\\n\ + local_data[local_id + work_group_size] = right; \\\n\ + local_indices[local_id + work_group_size] = local_id + work_group_size; \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + for (int stage = 0; stage < num_stages + 1; ++stage) \\\n\ + { \\\n\ + int signo = (local_id >> stage) & 1; \\\n\ + \\\n\ + for (int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + int postShift = (stage - passOfStage); \\\n\ + int pairDistance = 1 << postShift; \\\n\ + \\\n\ + int left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \\\n\ + int right_id = left_id + pairDistance; \\\n\ + \\\n\ + int left_idx = local_indices[left_id]; \\\n\ + int right_idx = local_indices[right_id]; \\\n\ + \\\n\ + int left_elem = local_data[left_id]; \\\n\ + int right_elem = local_data[right_id]; \\\n\ + \\\n\ + if ((left_elem < right_elem) ^ signo) \\\n\ + { \\\n\ + local_data[left_id] = right_elem; \\\n\ + local_data[right_id] = left_elem; \\\n\ + \\\n\ + local_indices[left_id] = right_idx; \\\n\ + local_indices[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + int4 dst; \\\n\ + dst.x = local_data[local_id]; \\\n\ + dst.y = local_data[local_id + work_group_size]; \\\n\ + \\\n\ + write_imagei(output, coord.xy, dst.xxxx); \\\n\ + write_imagei(output, coord.zy, dst.yyyy); \\\n\ + \\\n\ + int4 index; \\\n\ + index.x = ((int*)local_indices)[local_id]; \\\n\ + index.y = ((int*)local_indices)[local_id + work_group_size]; \\\n\ + \\\n\ + write_imagei(indices, coord.xy, index.xxxx); \\\n\ + write_imagei(indices, coord.zy, index.yyyy); \\\n\ + }\n\ +TOPK_I32(1 << 0, 0)\n\ +TOPK_I32(1 << 1, 1)\n\ +TOPK_I32(1 << 2, 2)\n\ +TOPK_I32(1 << 3, 3)\n\ +TOPK_I32(1 << 4, 4)\n\ +TOPK_I32(1 << 5, 5)\n\ +TOPK_I32(1 << 6, 6)\n\ +"; /* end of topk_cl*/ + static const char upsample_cl[] = "\n\ #define UPSAMPLE_PROCESS(data_type, read_fun, write_fun) \\\n\ data_type src = 0; \\\n\ @@ -57701,6 +61190,9 @@ static const source_map_t evis_resource[] = {"clip_U8_vx", clip_U8_vx}, {"conv1d_ovxlib_vx", conv1d_ovxlib_vx}, {"conv1d_ovxlib_k1024_vx", conv1d_ovxlib_k1024_vx}, + {"custom_softmax_vx", custom_softmax_vx}, + {"custom_warp_affine_vx", custom_warp_affine_vx}, + {"custom_warp_perspective_vx", custom_warp_perspective_vx}, {"depth2space_crd_vx", depth2space_crd_vx}, {"depthwise_conv1d_src0_vx", depthwise_conv1d_src0_vx}, {"depthwise_conv1d_src1_vx", depthwise_conv1d_src1_vx}, @@ -57714,7 +61206,9 @@ static const source_map_t evis_resource[] = {"floordiv_vx", floordiv_vx}, {"gather_vx", gather_vx}, {"gather_array_vx", gather_array_vx}, + {"gather_batch_vx", gather_batch_vx}, {"gather_mix_vx", gather_mix_vx}, + {"gather_mix_batch_vx", gather_mix_batch_vx}, {"gather_nd_vx", gather_nd_vx}, {"gather_nd_2d_vx", gather_nd_2d_vx}, {"gather_nd_2d_mix_vx", gather_nd_2d_mix_vx}, @@ -57785,6 +61279,7 @@ static const source_map_t evis_resource[] = {"lstmunit_activation_SP_U8_vx", lstmunit_activation_SP_U8_vx}, {"lstmunit_activation_S_F16_vx", lstmunit_activation_S_F16_vx}, {"lstmunit_activation_S_U8_vx", lstmunit_activation_S_U8_vx}, + {"matrixmul_bf16_vx", matrixmul_bf16_vx}, {"matrixmul_f16_vx", matrixmul_f16_vx}, {"matrixmul_f16f16_u8_vx", matrixmul_f16f16_u8_vx}, {"matrixmul_f16i16_i16_vx", matrixmul_f16i16_i16_vx}, @@ -57877,7 +61372,8 @@ static const source_map_t evis_resource[] = {"resize_bilinear_I16_vx", 
resize_bilinear_I16_vx}, {"resize_bilinear_I8_vx", resize_bilinear_I8_vx}, {"resize_bilinear_U8_vx", resize_bilinear_U8_vx}, - {"resize_bilinear_U8_half_pixel_centers_vx", resize_bilinear_U8_half_pixel_centers_vx}, + {"resize_bilinear_U8_half_pixel_centers_1_vx", resize_bilinear_U8_half_pixel_centers_1_vx}, + {"resize_bilinear_U8_half_pixel_centers_2_vx", resize_bilinear_U8_half_pixel_centers_2_vx}, {"resize_bilinear_U8_opt_vx", resize_bilinear_U8_opt_vx}, {"resize_bilinear_nhwc_vx", resize_bilinear_nhwc_vx}, {"resize_nearest_vx", resize_nearest_vx}, @@ -57916,14 +61412,17 @@ static const source_map_t cl_resource[] = {"argmin_axis2_cl", argmin_axis2_cl}, {"batchnorm_single_cl", batchnorm_single_cl}, {"cast_cl", cast_cl}, + {"clip_BF16_cl", clip_BF16_cl}, {"clip_F32_cl", clip_F32_cl}, {"clip_U8_cl", clip_U8_cl}, + {"depth2space_crd_cl", depth2space_crd_cl}, {"detect_post_box_cl", detect_post_box_cl}, {"eltwise_ops_helper_cl", eltwise_ops_helper_cl}, {"eltwise_unary_cl", eltwise_unary_cl}, {"erf_cl", erf_cl}, {"floordiv_cl", floordiv_cl}, {"gather_cl", gather_cl}, + {"gather_batch_cl", gather_batch_cl}, {"gather_nd_cl", gather_nd_cl}, {"gather_nd_3d_cl", gather_nd_3d_cl}, {"group_normalization_f32_cl", group_normalization_f32_cl}, @@ -58015,6 +61514,7 @@ static const source_map_t cl_resource[] = {"space2depth_internal_cl", space2depth_internal_cl}, {"swish_cl", swish_cl}, {"tile_cl", tile_cl}, + {"topk_cl", topk_cl}, {"upsample_cl", upsample_cl}, }; diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c index cffc314..69f987a 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c @@ -49,7 +49,7 @@ uint8_t * vsi_nn_LoadBinarySource buf = NULL; - fp = fopen( (char *)file, "rb" ); + fp = vsi_nn_fopen( (char *)file, "rb" ); VSILOGI( "Loading program from binary file." 
); if( NULL == fp ) @@ -234,11 +234,13 @@ static vsi_status vsi_nn_RegisterVXKernel if(evis == VSI_NN_HW_EVIS_NONE) { // set default evis version is 2 - sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va); + snprintf(cmd, MAX_BUILDPROGRAM_LEN, + "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va); } else { - sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va); + snprintf(cmd, MAX_BUILDPROGRAM_LEN, + "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va); } status = vxBuildProgram(program, cmd); @@ -319,14 +321,16 @@ static vsi_status vsi_nn_RegisterBinKernel if(evis == VSI_NN_HW_EVIS_NONE) { // set default evis version is 2 - sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va); + snprintf(cmd, MAX_BUILDPROGRAM_LEN, + "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va); } else { - sprintf(cmd, "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va); + snprintf(cmd, MAX_BUILDPROGRAM_LEN, + "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va); } #else - sprintf(cmd, "-cl-viv-vx-extension"); + snprintf(cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension"); #endif status = vxBuildProgram(program, cmd); @@ -530,7 +534,7 @@ void vsi_nn_VxResourceSetPath char* path ) { - strncpy(s_vx_resource_path, path, VSI_NN_MAX_PATH - 1); + vsi_nn_strncpy(s_vx_resource_path, path, VSI_NN_MAX_PATH - 1); } /* vsi_nn_VxResourceSetPath() */ const uint8_t * vsi_nn_VxBinResourceGetResource diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c b/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c index 1ce386a..f1141ba 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c @@ -51,6 +51,7 @@ static vsi_status op_compute { status = VSI_FAILURE; } + self->n = (vx_node)n; return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c index 70ff65e..06d439b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c @@ -131,7 +131,8 @@ static vsi_status _static_batchnorm ) { vsi_status status; - vx_tensor vx_input,vx_output; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_tensor_t* reshape_tensors[6] = { NULL }; status = VSI_FAILURE; status = _try_set_high_presision_tensor(inputs); @@ -142,29 +143,35 @@ static vsi_status _static_batchnorm } if(_is_3d_batchnorm(self, inputs)) { - vx_input = self->nn_param.batch_norm.local->reshaped_input->t; - vx_output = self->nn_param.batch_norm.local->reshaped_output->t; + reshape_tensors[0] = self->nn_param.batch_norm.local->reshaped_input; + reshape_tensors[5] = self->nn_param.batch_norm.local->reshaped_output; } else { - vx_input = inputs[0]->t; - vx_output = outputs[0]->t; + reshape_tensors[0] = inputs[0]; + reshape_tensors[5] = outputs[0]; } - self->n = vxBatchNormalizationLayer( - self->graph->g, - self->nn_param.batch_norm.eps, - inputs[1]->t, - inputs[2]->t, - inputs[3]->t, - inputs[4]->t, - vx_input, - vx_output - ); - if( NULL == self->n ) + reshape_tensors[1] = inputs[1]; + reshape_tensors[2] = inputs[2]; + reshape_tensors[3] = inputs[3]; + reshape_tensors[4] = inputs[4]; + + param = vsi_nn_kernel_param_create(); + 
vsi_nn_kernel_param_add_float32( param, "eps", self->nn_param.batch_norm.eps ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "batch_norm", + reshape_tensors, 5, + &reshape_tensors[5], 1, param ); + + if( self->n ) { - status = VSI_FAILURE; + status = VSI_SUCCESS; } + + vsi_nn_kernel_param_release( ¶m ); + return status; } @@ -439,7 +446,6 @@ static vsi_bool op_check } } /* op_check() */ - static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -492,7 +498,6 @@ static vsi_status op_deinit return VSI_SUCCESS; } - #ifdef __cplusplus extern "C" { #endif @@ -512,4 +517,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c index 4399d22..87aa2ba 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c @@ -379,7 +379,7 @@ static vsi_status op_deinit ) { vsi_status status = VSI_SUCCESS; - + vsi_nn_internal_deinit_node_wksp( self ); return status; } /* op_deinit() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index fec61bb..be82720 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -219,7 +219,7 @@ static vsi_bool op_check IO_TYPE(D_I16, D_U8) IO_TYPE(D_I8|Q_DFP, D_F32) IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I32) + IO_TYPE(D_I8|Q_DFP, D_I32|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_U32) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_ASYM) @@ -247,7 +247,7 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_I32|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U32) IO_TYPE(D_U8|Q_ASYM, D_F32) IO_TYPE(D_U8, D_U8|Q_ASYM) @@ -286,6 +286,8 @@ static vsi_bool op_check IO_TYPE(D_U32, D_U16) IO_TYPE(D_U32, D_U8|Q_ASYM) IO_TYPE(D_U32, D_U8) + IO_TYPE(D_BF16, D_I32) + IO_TYPE(D_I32, D_BF16) /* HW 9.0.1 */ IO_TYPE(D_I8|Q_DFP, D_BF16) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c new file mode 100644 index 0000000..7048f51 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c @@ -0,0 +1,302 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _deconv3d_local_data_t { + int32_t placeholder; +} deconv3d_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +#define COMPUTE_DECONV_SZ( in, ksize, pad_1, pad_2, stride, output_padding )\ + (( in - 1 ) * stride + ksize - pad_1 - pad_2 + output_padding) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + + // Create kernel param + vsi_nn_kernel_param_t * param; + //vsi_nn_kernel_node_t n; + param = vsi_nn_kernel_param_create(); + + // Add params +#define MAP_PARAM(type_name, value) {\ + vsi_nn_kernel_param_add_int32( param, type_name, value); \ + } + + MAP_PARAM("stride_w",self->nn_param.deconv3d.stride[0]); + MAP_PARAM("stride_h",self->nn_param.deconv3d.stride[1]); + MAP_PARAM("stride_d",self->nn_param.deconv3d.stride[2]); + + MAP_PARAM("outpadding_w",self->nn_param.deconv3d.output_padding[0]); + MAP_PARAM("outpadding_h",self->nn_param.deconv3d.output_padding[1]); + MAP_PARAM("outpadding_d",self->nn_param.deconv3d.output_padding[2]); + + MAP_PARAM("pad_left",self->nn_param.deconv3d.pad[0]); + MAP_PARAM("pad_right",self->nn_param.deconv3d.pad[1]); + MAP_PARAM("pad_top",self->nn_param.deconv3d.pad[2]); + MAP_PARAM("pad_bottom",self->nn_param.deconv3d.pad[3]); + MAP_PARAM("pad_front",self->nn_param.deconv3d.pad[4]); + MAP_PARAM("pad_end",self->nn_param.deconv3d.pad[5]); + + MAP_PARAM("weights",self->nn_param.deconv3d.weights); + MAP_PARAM("group",self->nn_param.deconv3d.group); + + MAP_PARAM("overflow_policy",self->vx_param.overflow_policy); + MAP_PARAM("rounding_policy",self->vx_param.rounding_policy); + MAP_PARAM("down_scale_size_rounding",self->vx_param.down_scale_size_rounding); + +#undef MAP_PARAM + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "deconv3d", + inputs, 3, outputs, 1, param ); + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_DECONVOLUTION, self, inputs, outputs); + + return ret; +} /* op_check() */ + +void _rotate_weight_data( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * weights) +{ + vsi_ssize_t oc = 0, ic = 0; + uint8_t* weight_data = NULL; + uint8_t* buffer = NULL; + vsi_ssize_t kernel_size_w = weights->attr.size[0]; + vsi_ssize_t kernel_size_h = weights->attr.size[1]; + vsi_ssize_t kernel_size_d = weights->attr.size[2]; + vsi_ssize_t weight_ic = weights->attr.size[3]; + vsi_ssize_t weight_oc = weights->attr.size[4]; + vsi_ssize_t slice_size = kernel_size_w * kernel_size_h; + vsi_ssize_t depth_size = slice_size * 
kernel_size_d; + int32_t item_size = vsi_nn_TypeGetBytes(weights->attr.dtype.vx_type); + + weight_data = vsi_nn_ConvertTensorToData(graph, weights); + buffer = (uint8_t*)malloc(item_size * depth_size * weight_ic * weight_oc); + memset(buffer, 0x00, item_size * depth_size * weight_ic * weight_oc); + //memcpy(buffer, weight_data, item_size * slice_size * weight_ic * weight_oc); + for(oc = 0; oc < weight_oc; oc++) + { + for(ic = 0; ic < weight_ic; ic++) + { + vsi_ssize_t d, h, w; + vsi_ssize_t offset = item_size * depth_size * (oc * weight_ic + ic); + for(d = 0; d < kernel_size_d; d++) + { + uint8_t *src_depth = weight_data + offset + (kernel_size_d - d - 1) * item_size * slice_size; + uint8_t *dst_depth = buffer + offset + d * item_size * slice_size; + for(h = 0; h < kernel_size_h; h ++) + { + uint8_t *dst_height = dst_depth + h * kernel_size_w * item_size; + uint8_t *src_height = src_depth + (kernel_size_h - 1 - h) * kernel_size_w * item_size; + for(w = 0; w < kernel_size_w; w++) + { + memcpy(dst_height + w * item_size, + src_height + (kernel_size_w - 1 - w) * item_size, + item_size); + } + } + } + } + } + + vsi_nn_CopyDataToTensor( graph, weights, buffer ); + vsi_nn_Free( buffer ); + vsi_nn_safe_free( weight_data ); +} + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_deconv3d_param *nn_param; + + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } + + /* Rotate 180 degrees for weights data */ + if (TRUE == inputs[1]->attr.is_const) + { + _rotate_weight_data(self->graph, inputs[1]); + } + else + { + VSILOGE("deconv3d: do not support dynamic weight"); + } + + nn_param = &self->nn_param.deconv3d; + + nn_param->group = ( 0 == nn_param->group ) ? 
1 : nn_param->group;
+    nn_param->ksize[0] = (uint32_t)inputs[1]->attr.size[0];
+    nn_param->ksize[1] = (uint32_t)inputs[1]->attr.size[1];
+    nn_param->ksize[2] = (uint32_t)inputs[1]->attr.size[2];
+
+    if(nn_param->group != 1)
+    {
+        VSILOGE("deconv3d: only support group == 1, but group is %d", nn_param->group);
+        return FALSE;
+    }
+
+    if(nn_param->ksize[2] < nn_param->stride[2])
+    {
+        VSILOGE("deconv3d: only support kernel_depth >= stride_depth, but \
+            kernel_depth = %d, stride_depth = %d", nn_param->ksize[2], nn_param->stride[2]);
+        return FALSE;
+    }
+
+    if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+    {
+        outputs[0]->attr.size[0] = COMPUTE_DECONV_SZ(
+            inputs[0]->attr.size[0],
+            nn_param->ksize[0],
+            nn_param->pad[0],
+            nn_param->pad[1],
+            nn_param->stride[0],
+            nn_param->output_padding[0]
+            );
+
+        outputs[0]->attr.size[1] = COMPUTE_DECONV_SZ(
+            inputs[0]->attr.size[1],
+            nn_param->ksize[1],
+            nn_param->pad[2],
+            nn_param->pad[3],
+            nn_param->stride[1],
+            nn_param->output_padding[1]
+            );
+        outputs[0]->attr.size[2] = COMPUTE_DECONV_SZ(
+            inputs[0]->attr.size[2],
+            nn_param->ksize[2],
+            nn_param->pad[4],
+            nn_param->pad[5],
+            nn_param->stride[2],
+            nn_param->output_padding[2]
+            );
+        if(self->nn_param.deconv3d.weights > 0)
+        {
+            outputs[0]->attr.size[3] = self->nn_param.deconv3d.weights;
+        }
+        else
+        {
+            outputs[0]->attr.size[3] = inputs[1]->attr.size[3];
+        }
+        outputs[0]->attr.size[4] = inputs[0]->attr.size[4];
+        outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
+    }
+
+    return TRUE;
+} /* op_setup() */
+
+static vsi_status op_init
+    (
+    vsi_nn_node_t* self
+    )
+{
+    /* TODO
+    //self->nn_param.deconv3d.local = \
+    //    (deconv3d_local_data_t*)malloc(sizeof(deconv3d_local_data_t));
+    */
+
+    return VSI_SUCCESS;
+} /* op_init() */
+
+static vsi_status op_deinit
+    (
+    vsi_nn_node_t* self
+    )
+{
+    vsi_status status = VSI_SUCCESS;
+
+    status = vsi_nn_op_common_deinit(self);
+
+    /* TODO
+    //vsi_nn_safe_free(self->nn_param.deconv3d.local);
+    */
+
+    return status;
+} /* op_deinit() */
+
+__BEGIN_DECLS
+
+/* Registrar */
+DEF_OP_REG
+    (
+    /* op_name */ DECONV3D,
+    /* init */ op_init,
+    /* compute */ op_compute,
+    /* deinit */ op_deinit,
+    /* check */ op_check,
+    /* setup */ op_setup,
+    /* optimize */ NULL,
+    /* input_num */ _INPUT_NUM,
+    /* output_num */ _OUTPUT_NUM
+    );
+
+__END_DECLS
\ No newline at end of file
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c
index d9de8b9..1f39eb7 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c
@@ -114,6 +114,8 @@ static vsi_bool op_check
         IO_TYPE(D_I8|Q_DFP, D_F16)
         IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
         IO_TYPE(D_I16|Q_DFP, D_F16)
+        IO_TYPE(D_BF16, D_BF16)
+        IO_TYPE(D_F32, D_F32)
     END_IO_TYPE_DECL(DEPTH2SPACE_INTERNAL)
     if(!VALIDATE_OP_IO_TYPES(DEPTH2SPACE_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) {
         char* desc = generate_op_io_types_desc(inputs,
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c
index 2373688..19a5303 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c
@@ -198,32 +198,30 @@ static vsi_bool op_check_minimum
 {
     /* check inputs outputs data type */
     BEGIN_IO_TYPE_DECL(MINIMUM, 2, 1)
-        IO_TYPE(D_F16, D_F16, D_F16)
-        IO_TYPE(D_F16, D_F16, D_I16|Q_DFP)
-        IO_TYPE(D_F16, D_F16, D_I8|Q_DFP)
-        IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM)
-
-
IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) - - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_SYM) END_IO_TYPE_DECL(MINIMUM) if(!VALIDATE_OP_IO_TYPES(MINIMUM, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, @@ -245,32 +243,30 @@ static vsi_bool op_check_maximum { /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(MAXIMUM, 2, 1) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - - IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) - - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + 
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_SYM) END_IO_TYPE_DECL(MAXIMUM) if(!VALIDATE_OP_IO_TYPES(MAXIMUM, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c index d8ae9d9..a3a054e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -104,11 +104,11 @@ static vsi_bool op_setup out_rank = inputs[0]->attr.dim_num; - for(i = 0; i < out_rank; i++) + for (i = 0; i < out_rank; i++) { shape[i] = inputs[0]->attr.size[i]; } - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = out_rank; memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); @@ -120,7 +120,7 @@ static vsi_bool op_setup total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, outputs[0]->attr.dim_num ); - if( total_size_expected != total_size_got ) + if ( total_size_expected != total_size_got ) { VSILOGW("Output size mismatch, expect %"VSI_SIZE_T_SPECIFIER", but got %"VSI_SIZE_T_SPECIFIER"", total_size_expected, total_size_got); @@ -225,6 +225,7 @@ DEF_OP_REG(name, op_init_##kernel_name, op_compute_##kernel_name, \ vsi_nn_op_common_deinit, op_check, op_setup, NULL, 1, 1) DEF_ELEMENT_WISE_UNARY_OP( SIN, sin ); +DEF_ELEMENT_WISE_UNARY_OP( COS, cos ); DEF_ELEMENT_WISE_UNARY_OP( EXP, exp ); DEF_ELEMENT_WISE_UNARY_OP( LOG, log ); DEF_ELEMENT_WISE_UNARY_OP( ELU, elu ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c index 325e9c1..0c57380 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c @@ -117,6 +117,12 @@ static vsi_bool op_check IO_TYPE(D_I32, D_I16|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I32, D_I16|Q_DFP) END_IO_TYPE_DECL(FLOORDIV) if (!VALIDATE_OP_IO_TYPES(FLOORDIV, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c index cf19eeb..6c1bdc2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c @@ -165,6 +165,20 @@ static vsi_bool op_check /* NN Support - F32 */ IO_TYPE(D_F32, D_BF16, D_F32, D_F32) IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) + /* HW 9.0.1 */ + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + 
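Editor's note on the eltwise-unary change above: adding COS needs only one new DEF_ELEMENT_WISE_UNARY_OP line because the macro stamps out the per-operator init/compute glue with token pasting before handing it to DEF_OP_REG. A minimal sketch of that macro pattern, with illustrative names that are not the ovxlib macro:

    #include <stdio.h>
    #include <math.h>

    /* Each expansion generates a distinct op_<func> symbol via ## pasting. */
    #define DEF_UNARY_OP(NAME, FUNC) \
        static double op_##FUNC(double x) { return FUNC(x); }

    DEF_UNARY_OP(SIN, sin)
    DEF_UNARY_OP(COS, cos)

    int main(void)
    {
        printf("%f %f\n", op_sin(0.0), op_cos(0.0));  /* 0.000000 1.000000 */
        return 0;
    }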
IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + END_IO_TYPE_DECL(FCL_RELU) ret = VALIDATE_OP_IO_TYPES(FCL_RELU, self, inputs, self->input.num, outputs, self->output.num); @@ -347,4 +361,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index 8120757..34bcd78 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -51,6 +51,7 @@ static vsi_status op_compute uint32_t i = 0; vsi_size_t block_size = 1, block_num = 1, axis_num = 0, indices_num = 1; int32_t axis = self->nn_param.gather.axis; + int32_t batch_dims = self->nn_param.gather.batch_dims; vsi_size_t *input_size = inputs[0]->attr.size; uint32_t dims_num = inputs[0]->attr.dim_num; @@ -62,11 +63,11 @@ static vsi_status op_compute } axis_num = input_size[axis]; - for(i = axis + 1; i < dims_num; ++i) + for(i = axis + 1; i < dims_num - batch_dims; ++i) { block_num *= input_size[i]; } - for(i = 0; i < (uint32_t)inputs[1]->attr.dim_num; ++i) + for(i = 0; i < (uint32_t)inputs[1]->attr.dim_num - batch_dims; ++i) { indices_num *= inputs[1]->attr.size[i]; } @@ -76,6 +77,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "axis_num", (int32_t)axis_num ); vsi_nn_kernel_param_add_int32( param, "axis", (int32_t)axis ); vsi_nn_kernel_param_add_int32( param, "indices_num", (int32_t)indices_num ); + vsi_nn_kernel_param_add_int32( param, "batch_dims", (int32_t)batch_dims ); n = vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, outputs, 1, param ); if( n != NULL ) { @@ -125,6 +127,18 @@ static vsi_bool op_check return TRUE; } /* op_check() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.gather.batch_dims = 0; + + return status; +} /* op_init() */ + static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -186,7 +200,7 @@ extern "C" { DEF_OP_REG ( /* op_name */ GATHER, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c index 28a490c..6890763 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c @@ -196,6 +196,12 @@ static vsi_status op_compute VSILOGE("Add vxConvolutionLayer fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); return VSI_FAILURE; } + else + { + // no need to maintain self->n + vxReleaseNode( &self->n ); + self->n = NULL; + } } return VSI_SUCCESS; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c index ed652c3..cdead0c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -37,6 +37,7 @@ #include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) @@ -72,18 +73,46 @@ static vsi_status _try_set_high_presision_tensor return status; } -static vsi_bool _is_3d_instance_norm +static void 
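Editor's note on the gather batch_dims change above: the trailing batch_dims entries of both the data shape and the indices shape are treated as shared batch axes (ovxlib sizes are stored innermost-first, so batches sit at the high end of the size array; that layout reading is an assumption), and they are excluded when block_num and indices_num are derived for the kernel. A small sketch of that bookkeeping with a hypothetical gather_counts helper:

    #include <stddef.h>
    #include <stdint.h>

    /* Element-count bookkeeping for a batched gather, mirroring the hunk above. */
    static void gather_counts(const size_t *in_size, uint32_t in_rank,
                              const size_t *idx_size, uint32_t idx_rank,
                              uint32_t axis, uint32_t batch_dims,
                              size_t *block_size, size_t *block_num,
                              size_t *indices_num)
    {
        uint32_t i;
        *block_size = 1;   /* elements below the gather axis            */
        *block_num = 1;    /* slices above the axis, batches excluded   */
        *indices_num = 1;  /* index entries per batch                   */
        for (i = 0; i < axis; ++i)                        *block_size *= in_size[i];
        for (i = axis + 1; i < in_rank - batch_dims; ++i) *block_num *= in_size[i];
        for (i = 0; i < idx_rank - batch_dims; ++i)       *indices_num *= idx_size[i];
    }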
vsi_nn_optimize_instance_norm_shape ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs + const vsi_size_t* shape_x, const vsi_size_t rank_x, + vsi_size_t* out_shape_x, vsi_size_t* out_rank_x ) { - if( 3 == inputs[0]->attr.dim_num ) + vsi_size_t rank = rank_x; + vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = { {0} }; + + if (rank_x > 4) { - return TRUE; + memcpy(shape[0], shape_x, (rank_x - 2) * sizeof(vsi_size_t)); + + vsi_nn_kernel_optimize_element_shape(shape[0], rank_x - 2, shape[1], &rank); } - return FALSE; -} /* _is_3d_instance_norm() */ + + if (rank_x == 3) + { + out_shape_x[0] = shape_x[0]; + out_shape_x[1] = 1; + out_shape_x[2] = shape_x[1]; + out_shape_x[3] = shape_x[2]; + + *out_rank_x = 4; + } + /****reshape [n, c, d0, d1, ..., dn] to [n, c, h, w]***/ + else if (rank_x > 4 && rank == 2) + { + memcpy(out_shape_x, shape[1], 2 * sizeof(vsi_size_t)); + memcpy(&out_shape_x[2], &shape_x[rank_x - 2], 2 * sizeof(vsi_size_t)); + + *out_rank_x = 4; + } + else + { + memcpy(out_shape_x, shape_x, rank_x * sizeof(vsi_size_t)); + + *out_rank_x = rank_x; + } +} static vsi_status op_compute ( @@ -96,112 +125,48 @@ static vsi_status op_compute vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; float eps = self->nn_param.instancenorm.eps; - vsi_size_t *input_size = inputs[0]->attr.size; - vsi_size_t dims_num = inputs[0]->attr.dim_num; - int32_t rs_flg = 0; - vsi_nn_tensor_t * tmp_inputs[3] = {NULL, NULL, NULL}; - vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; - vsi_nn_instancenorm_lcl_data2 *local = self->nn_param.instancenorm.lcl2_data; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t new_rank = 0; + vsi_nn_tensor_t * tmp_tensors[4] = {NULL}; - status = _try_set_high_presision_tensor(inputs); + vsi_nn_optimize_instance_norm_shape(inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank); + + tmp_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shape, new_rank ); + tmp_tensors[1] = inputs[1]; + tmp_tensors[2] = inputs[2]; + tmp_tensors[3] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shape, new_rank ); + + status = _try_set_high_presision_tensor(tmp_tensors); if(status != VSI_SUCCESS) { VSILOGE("Set tensor attr of high presision fail"); return status; } - if(_is_3d_instance_norm(self, inputs)) - { - tmp_inputs[0] = local->reshaped_input; - tmp_outputs[0] = local->reshaped_output; - tmp_inputs[1] = inputs[1]; - tmp_inputs[2] = inputs[2]; - } - else - { - tmp_inputs[0] = inputs[0]; - tmp_outputs[0] = outputs[0]; - tmp_inputs[1] = inputs[1]; - tmp_inputs[2] = inputs[2]; - if((input_size[1] * input_size[2] < 65536) - && dims_num > 2) - { - rs_flg = 1; - } - } - param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_float32( param, "eps", eps ); - vsi_nn_kernel_param_add_int32( param, "reshape_flg", rs_flg ); + n = vsi_nn_kernel_selector( self->graph, "instance_norm", - tmp_inputs, _INPUT_NUM, tmp_outputs, _OUTPUT_NUM, param ); + tmp_tensors, _INPUT_NUM, &tmp_tensors[3], _OUTPUT_NUM, param ); if( n != NULL ) { self->n = (vx_node)n; status = VSI_SUCCESS; } - if(param != NULL) + if (param != NULL) { vsi_nn_kernel_param_release( ¶m ); } + vsi_safe_release_tensor(tmp_tensors[0]); + vsi_safe_release_tensor(tmp_tensors[3]); + return status; } /* op_compute() */ -static vsi_status op_optimize - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_opt_direction_e direction - ) -{ - uint32_t dim = 0; - vsi_nn_instancenorm_lcl_data2 *local = NULL; - vsi_size_t shape[VSI_NN_MAX_DIM_NUM]; - char tensor_name[128]; - - 
dim = inputs[0]->attr.dim_num; - if(_is_3d_instance_norm(self, inputs) == FALSE) - { - return VSI_SUCCESS; - } - - VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); - /* - insert a reshape node before and after 3D instance_norm - */ - shape[0] = inputs[0]->attr.size[0]; - shape[1] = 1; - shape[2] = inputs[0]->attr.size[1]; - shape[3] = inputs[0]->attr.size[2]; - dim = 4; - local = self->nn_param.instancenorm.lcl2_data; - if (VSI_NN_OPTIMIZE_FORWARD == direction) - { - /* reshape 3d input (xcn) --> 4d input (whcn) */ - local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim); - } - else - { - /* reshape 3d output(xcn) --> 4d output(whcn) */ - local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim); - if(local->reshaped_output && local->reshaped_output->t) - { - memset(tensor_name, 0, sizeof(tensor_name)); - snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid); - if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE) - { - VSILOGW("Set uid %u batchnorm reshaped output name fail", self->uid); - return VSI_FAILURE; - } - } - } - - return VSI_SUCCESS; -} /* op_optimize() */ - static vsi_bool op_check ( vsi_nn_node_t * self, @@ -241,66 +206,6 @@ static vsi_bool op_check return TRUE; } /* op_check() */ -static vsi_status op_init - ( - vsi_nn_node_t * self - ) -{ - vsi_status status = VSI_SUCCESS; - - self->nn_param.instancenorm.lcl2_data = - (vsi_nn_instancenorm_lcl_data2 *)malloc(sizeof(vsi_nn_instancenorm_lcl_data2)); - if (NULL == self->nn_param.instancenorm.lcl2_data) - { - return VX_ERROR_NO_MEMORY; - } - - memset( self->nn_param.instancenorm.lcl2_data, 0, sizeof(vsi_nn_instancenorm_lcl_data2) ); - - self->nn_param.instancenorm.lcl2_data->reshapeFlg = 0; - self->nn_param.instancenorm.lcl2_data->execute_on_sw = 0; - self->nn_param.instancenorm.lcl2_data->hash_idx = 0; - self->nn_param.instancenorm.lcl2_data->reshaped_input = NULL; - self->nn_param.instancenorm.lcl2_data->reshaped_output = NULL; - - return status; -} /* op_init() */ - -static vsi_status op_deinit - ( - vsi_nn_node_t * self - ) -{ - uint32_t i; - vsi_nn_instancenormalize_param *p = &(self->nn_param.instancenorm); - for (i = 0; i < _VSI_NN_INSTANCENORM_LOCAL_TENSOR_NUM; i++) - { - if (self->nn_param.instancenorm.local.local_tensor[i] != NULL) - { - vxReleaseTensor(&(self->nn_param.instancenorm.local.local_tensor[i])); - self->nn_param.instancenorm.local.local_tensor[i] = NULL; - } - } - if(p->lcl2_data->reshaped_input) - { - vsi_nn_ReleaseTensor(&(p->lcl2_data->reshaped_input)); - p->lcl2_data->reshaped_input = NULL; - } - if(p->lcl2_data->reshaped_output) - { - vsi_nn_ReleaseTensor(&(p->lcl2_data->reshaped_output)); - p->lcl2_data->reshaped_output = NULL; - } - if(self->nn_param.instancenorm.lcl2_data) - { - free(self->nn_param.instancenorm.lcl2_data); - self->nn_param.instancenorm.lcl2_data = NULL; - } - vsi_nn_op_common_deinit(self); - - return VSI_SUCCESS; -} /* op_deinit() */ - #ifdef __cplusplus extern "C" { #endif @@ -308,12 +213,12 @@ extern "C" { DEF_OP_REG ( /* op_name */ INSTANCE_NORM, - /* init */ op_init, + /* init */ NULL, /* compute */ op_compute, - /* deinit */ op_deinit, + /* deinit */ vsi_nn_op_common_deinit, /* check */ op_check, /* setup */ vsi_nn_op_common_setup, - /* optimize */ op_optimize, + /* optimize */ NULL, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c 
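Editor's note on the instance-norm rework above: vsi_nn_optimize_instance_norm_shape folds every input into the 4-D layout the kernels expect, so the separate 3-D optimize pass and its reshape bookkeeping could be deleted. For the common 3-D case a [w, c, n] tensor becomes [w, 1, c, n], which is the same view the removed op_optimize used to build with explicit reshape nodes. A small standalone illustration of that mapping (the rank > 4 flattening branch is omitted here):

    #include <stddef.h>

    /* Fold a 3-D instance-norm input [w, c, n] into 4-D [w, 1, c, n];
     * shapes of rank <= 4 are passed through unchanged. */
    static void fold_to_4d(const size_t *in, size_t rank,
                           size_t *out, size_t *out_rank)
    {
        if (rank == 3)
        {
            out[0] = in[0]; out[1] = 1; out[2] = in[1]; out[3] = in[2];
            *out_rank = 4;
        }
        else
        {
            size_t i;
            for (i = 0; i < rank; ++i) out[i] = in[i];
            *out_rank = rank;
        }
    }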
b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c index 8c298ae..08955f3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c @@ -45,7 +45,6 @@ #define VSI_NN_L2NORMALIZESCALE_DEFAULT_AXIS 2 - static vsi_nn_tensor_t* _expand_scale_tensor ( vsi_nn_graph_t *graph, @@ -84,7 +83,7 @@ static vsi_nn_tensor_t* _expand_scale_tensor attr.size[0] = scale_size_out; attr.size[1] = 1; attr.dim_num = 2; - out_dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; attr.vtl = FALSE; scale_tensor = vsi_nn_CreateTensor(graph, &attr); @@ -115,7 +114,6 @@ final: return scale_tensor; } - static vsi_bool _check_value_is_equal_to_one ( vsi_nn_graph_t* graph, @@ -429,4 +427,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c b/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c index 69e27a1..3e79acc 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c @@ -55,11 +55,13 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "b_v", self->nn_param.linear.b ); n = vsi_nn_kernel_selector( self->graph, "linear", inputs, 1, outputs, 1, param ); - if( n == NULL ) + if ( n == NULL ) { status = VSI_FAILURE; } + self->n = (vx_node)n; + vsi_nn_kernel_param_release( ¶m ); return status; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c index bff1972..7b2d441 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c @@ -113,6 +113,7 @@ static vsi_bool op_check BEGIN_IO_TYPE_DECL(LOGICAL_OPS, 2, 1) IO_TYPE(D_I8, D_I8, D_I8) IO_TYPE(D_BOOL8, D_BOOL8, D_BOOL8) + IO_TYPE(D_BF16, D_BF16, D_BOOL8) END_IO_TYPE_DECL(LOGICAL_OPS) if(!VALIDATE_OP_IO_TYPES(LOGICAL_OPS, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c index 5433281..d792d34 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c @@ -634,7 +634,7 @@ static vsi_bool op_setup { if ( p->local->use_hybrid && p->local->use_projection_bias ) { - vsi_bool use_virtual_tensor = inputs[LSTMUNIT_INPUT_BIAS_PROJ]->attr.vtl; + use_virtual_tensor = inputs[LSTMUNIT_INPUT_BIAS_PROJ]->attr.vtl; input_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &output_tensor->t->attr, &inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr, VSI_NN_OP_FCL, FALSE); zero_bias_tensor = input_tensor->t; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c index fcf29f4..5da258f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c @@ -174,6 +174,7 @@ static vsi_bool op_check IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_BF16) END_IO_TYPE_DECL(MATRIXMUL) if (!VALIDATE_OP_IO_TYPES(MATRIXMUL, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c index bd24f7d..881767e 100644 
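Editor's note on the lstmunit_ovxlib change above: the old line declared a fresh use_virtual_tensor inside the inner block, shadowing the function-scope variable the rest of op_setup reads, so the value assigned there was silently discarded at the closing brace. The fix turns it into a plain assignment. A minimal reproduction of the shadowing pattern:

    #include <stdio.h>

    int main(void)
    {
        int use_virtual = 0;
        {
            int use_virtual = 1;   /* bug: new variable shadows the outer one */
            (void)use_virtual;
        }
        printf("%d\n", use_virtual);   /* prints 0 */

        {
            use_virtual = 1;           /* fix: assignment updates the outer variable */
        }
        printf("%d\n", use_virtual);   /* prints 1 */
        return 0;
    }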
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c @@ -121,6 +121,8 @@ static vsi_bool op_check IO_TYPE(D_F16, D_F32, D_F32) IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_I32, D_F32, D_F32) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32, D_F32) END_IO_TYPE_DECL(MOMENTS) if (!VALIDATE_OP_IO_TYPES(MOMENTS, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c b/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c index bddcc12..99dbd5d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c @@ -107,6 +107,7 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(ONE_HOT) if (!VALIDATE_OP_IO_TYPES(ONE_HOT, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c new file mode 100644 index 0000000..d0b89aa --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c @@ -0,0 +1,198 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _pad2_local_data_t { + int32_t placeholder; +} pad2_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static int32_t _get_vx_pad_mode(vx_enum mode) +{ + int32_t pad_mode = 0; + switch (mode) + { + case VSI_NN_PAD_MODE_CONSTANT: + pad_mode = VX_PAD_CONSTANT; + break; + case VSI_NN_PAD_MODE_REPLICATE: + pad_mode = VX_PAD_REPLICATE; + break; + case VSI_NN_PAD_MODE_SYMMETRIC: + pad_mode = VX_PAD_MIRROR_SYMMETRIC; + break; + case VSI_NN_PAD_MODE_REFLECT: + pad_mode = VX_PAD_MIRROR_REFLECT; + break; + default: + VSILOGE("Wrong pad_mode value"); + break; + } + + return pad_mode; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_pad2_param *p = &self->nn_param.pad2; + vsi_nn_kernel_param_t * param; + int32_t pad_mode = _get_vx_pad_mode(p->mode); + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_buffer( param, "front_size", (void *)p->front_size, p->dim_num ); + vsi_nn_kernel_param_add_buffer( param, "back_size", (void *)p->back_size, p->dim_num ); + vsi_nn_kernel_param_add_int32( param, "pad_mode", pad_mode ); + vsi_nn_kernel_param_add_float32( param, "const_val", p->const_val ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "pad2", + inputs, 1, outputs, 1, param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(PAD2, 1, 1) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + END_IO_TYPE_DECL(PAD2) + if (!VALIDATE_OP_IO_TYPES(PAD2, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + if (self->nn_param.pad2.dim_num != inputs[0]->attr.dim_num + && self->nn_param.pad2.dim_num != 0 ) + { + VSILOGE("Error:input tensor dim should be equal with pad's."); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i = 0; + if (self->nn_param.pad2.dim_num == 0) + { + self->nn_param.pad2.dim_num = (uint8_t)inputs[0]->attr.dim_num; + } + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) + { + for (i = 0; i < self->nn_param.pad2.dim_num; i ++) + { + uint32_t front = self->nn_param.pad2.front_size[i]; + uint32_t back = self->nn_param.pad2.back_size[i]; + outputs[0]->attr.size[i] = inputs[0]->attr.size[i] + front + back; + } + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + else + { + for (i = 0; i < self->nn_param.pad2.dim_num; i ++) + { + uint32_t front = self->nn_param.pad2.front_size[i]; + uint32_t back = self->nn_param.pad2.back_size[i]; + + if (front + back + inputs[0]->attr.size[i] != outputs[0]->attr.size[i]) + { + VSILOGE("Error:output shape[%u] not equal front padding[%u] + input shape[%u] + back padding[%u]", + outputs[0]->attr.size[i], front, back); + return FALSE; + } + } + } + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ PAD2, + /* init */ NULL, + /* compute */ op_compute, + /* 
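Editor's note on the PAD2 op_setup above: the output extent along every axis is simply the input extent plus the front and back pad counts, and the same identity is re-checked when the caller supplied an explicit output shape. The arithmetic in isolation (padded_extent is an illustrative helper, not part of the patch):

    #include <stdint.h>

    /* Padded extent along one axis: input + pad_front + pad_back.
     * e.g. a width of 5 with front = 1 and back = 2 becomes 8. */
    static uint32_t padded_extent(uint32_t in, uint32_t front, uint32_t back)
    {
        return in + front + back;
    }

    /* Validation direction used when the output shape is pre-set. */
    static int padded_extent_matches(uint32_t in, uint32_t front,
                                     uint32_t back, uint32_t out)
    {
        return out == in + front + back;
    }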
deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index e61d9f2..ecbf5fa 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -53,6 +53,7 @@ typedef struct _vsi_nn_reduce_lcl2_data_t vsi_nn_tensor_t *axis_tensor2; int32_t axes[VSI_NN_MAX_DIM_NUM]; int32_t axes_num; + vsi_bool use_internal_node; } vsi_nn_reduce_lcl2_data_t; static vsi_status op_comput_reduce_mean(vsi_nn_node_t * self, @@ -148,7 +149,54 @@ static vsi_bool caculate_reshape_size(uint32_t* dim_value, return enable_reshape; } +static vsi_bool _check_is_sp_supported_type + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_enum type + ) +{ + int32_t * axes = self->nn_param.reduce.local2->axes; + int32_t axes_num = self->nn_param.reduce.local2->axes_num; + vsi_size_t shapes[4][VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t axis_in[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t i = 0; + uint32_t axis_size = 0; + uint32_t rank_in = 0; + uint32_t rank_out = 0; + vsi_bool ret = FALSE; + if ( !self->graph->ctx->config.support_stream_processor || + (type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN) ) + { + return FALSE; + } + + if ( (VSI_NN_TYPE_FLOAT64 == input->attr.dtype.vx_type) + || (VSI_NN_TYPE_UINT32 == input->attr.dtype.vx_type) + || (VSI_NN_TYPE_UINT64 == input->attr.dtype.vx_type) + ) + { + return FALSE; + } + + for (i = 0; i < axes_num; i++) + { + shapes[0][i] = input->attr.size[axes[i]]; + shapes[1][i] = 1; + axis_in[i] = i; + } + + ret = vsi_nn_kernel_optimize_reduce_shape( + shapes[0], axes_num, + axis_in, axes_num, + shapes[1], axes_num, + shapes[2], &rank_in, shapes[3], &rank_out, + new_axis, &axis_size); + + return ret && axis_size < 3; +} static vsi_status op_compute ( vsi_nn_node_t * self, @@ -158,7 +206,11 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; - if (self->nn_param.reduce.type == VSI_NN_REDUCE_MEAN) + if ( self->nn_param.reduce.local2->use_internal_node ) + { + status = vsi_nn_internal_compute_node( self ); + } + else if (self->nn_param.reduce.type == VSI_NN_REDUCE_MEAN) { vx_tensor input_t, output_t; vsi_nn_tensor_t *axis_tensor = NULL; @@ -440,16 +492,6 @@ static vsi_status op_compute input_t, output_t); } - - } - else if (self->nn_param.reduce.type == VSI_NN_REDUCE_SUM || - self->nn_param.reduce.type == VSI_NN_REDUCE_MAX || - self->nn_param.reduce.type == VSI_NN_REDUCE_MIN || - self->nn_param.reduce.type == VSI_NN_REDUCE_ALL || - self->nn_param.reduce.type == VSI_NN_REDUCE_ANY || - self->nn_param.reduce.type == VSI_NN_REDUCE_PROD) - { - status = vsi_nn_internal_compute_node( self ); } return status; @@ -463,12 +505,7 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { - if (self->nn_param.reduce.type == VSI_NN_REDUCE_SUM || - self->nn_param.reduce.type == VSI_NN_REDUCE_MAX || - self->nn_param.reduce.type == VSI_NN_REDUCE_MIN || - self->nn_param.reduce.type == VSI_NN_REDUCE_ALL || - self->nn_param.reduce.type == VSI_NN_REDUCE_ANY || - self->nn_param.reduce.type == VSI_NN_REDUCE_PROD) + if ( self->nn_param.reduce.local2->use_internal_node ) { return vsi_nn_internal_optimize_node(self, direction ); } @@ -726,7 +763,6 @@ static vsi_bool op_set_reduce_axis( (vsi_size_t*)resolved_dim2, &resolved_dim_count2 ); } - for (i = 0; i < 
(uint32_t)resolved_dim_count2; i++) { self->nn_param.reduce.local2->axes[i] = (int32_t)resolved_dim2[i]; @@ -736,6 +772,92 @@ static vsi_bool op_set_reduce_axis( return TRUE; } +static vsi_bool op_set_sp_reduce_internal + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_enum type_name + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* tensor1 = NULL; + vsi_nn_tensor_t* new_output = NULL; + uint32_t* permute_in_perm = NULL; + int32_t * new_axis = NULL; + vsi_size_t shapes[VSI_NN_MAX_DIM_NUM] = {1}; + int32_t use_virtual_tensor = TRUE; + vsi_nn_internal_node_t* tmp_inode = NULL; + int32_t * axes = self->nn_param.reduce.local2->axes; + int32_t axes_num = self->nn_param.reduce.local2->axes_num; + int32_t i = 0, j = 0, index = 0; + vsi_size_t reduce_size = 1; + + vsi_nn_internal_init_node_wksp( self ); + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor); + tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 ); + permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, + inputs[0]->attr.dim_num * sizeof(uint32_t)); + + for ( i = 0; i < axes_num; i++) + { + shapes[index] = outputs[0]->attr.size[axes[i]]; + permute_in_perm[index ++] = axes[i]; + reduce_size *= inputs[0]->attr.size[axes[i]]; + } + + for ( j = 0; j < (int32_t)inputs[0]->attr.dim_num; j++) + { + for (i = 0; i < axes_num; i++) + { + if (j == axes[i]) + { + break; + } + } + if (i == axes_num) + { + shapes[index] = outputs[0]->attr.size[j]; + permute_in_perm[index ++] = j; + } + } + tmp_inode->node->nn_param.permute.perm = permute_in_perm; + tmp_inode->node->nn_param.permute.dim_num = inputs[0]->attr.dim_num; + tmp_inode->inputs[0] = inputs[0]; + tmp_inode->outputs[0] = tensor1->t; + vsi_nn_internal_setup_node(self, tmp_inode); + + new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes, outputs[0]->attr.dim_num); + + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_REDUCE_MEAN_INTERNAL, 0, 0 ); + new_axis = (int32_t *)vsi_nn_internal_new_node_param(tmp_inode, + axes_num * sizeof(int32_t)); + for (i = 0; i < axes_num; i++) + { + new_axis[i] = i; + } + tmp_inode->inputs[0] = tensor1->t; + tmp_inode->outputs[0] = new_output; + tmp_inode->node->nn_param.reduce_mean_internal.axis = new_axis; + tmp_inode->node->nn_param.reduce_mean_internal.axis_num = axes_num; + if (type_name == VSI_NN_REDUCE_SUM) + { + tmp_inode->node->nn_param.reduce_mean_internal.scale = 1.0f; + } + else + { + tmp_inode->node->nn_param.reduce_mean_internal.scale = 1.0f / (float)reduce_size; + } + vsi_nn_internal_setup_node(self, tmp_inode); + + self->nn_param.reduce.local2->reshaped_output = new_output; + + return TRUE; +} static vsi_bool op_set_reduce_internal ( @@ -920,7 +1042,6 @@ static vsi_bool op_set_reduce_internal curr->outputs[0] = tmp_output_tensor[1]->t; vsi_nn_internal_setup_node( self, curr ); - if (3 == axes[resolved_dim_count - 1]) { vsi_bool enable_reshape = TRUE; @@ -968,7 +1089,6 @@ static vsi_bool op_set_reduce_internal return TRUE; } - static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -1063,32 +1183,43 @@ static vsi_bool op_setup reshape_out_t[0] = vsi_nn_reshape_tensor( self->graph, outputs[0], shape, new_rank ); self->nn_param.reduce.local2->reshaped_output1 = reshape_out_t[0]; - if (self->nn_param.reduce.type == VSI_NN_REDUCE_SUM) + + if (_check_is_sp_supported_type(self, reshape_in_t[0], 
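Editor's note on op_set_sp_reduce_internal above: the stream-processor path routes both REDUCE_SUM and REDUCE_MEAN onto the single REDUCE_MEAN_INTERNAL node by treating the result as a scaled sum, with scale = 1 reproducing the sum and scale = 1/N the mean over the N reduced elements. Assuming that reading of the internal kernel, a plain reference version is:

    #include <stddef.h>

    /* Scaled reduction over n elements: sum(x) * scale.
     *   scale == 1.0f       -> behaves like REDUCE_SUM
     *   scale == 1.0f / n   -> behaves like REDUCE_MEAN */
    static float scaled_reduce(const float *x, size_t n, float scale)
    {
        float acc = 0.0f;
        size_t i;
        for (i = 0; i < n; ++i)
        {
            acc += x[i];
        }
        return acc * scale;
    }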
self->nn_param.reduce.type)) { + self->nn_param.reduce.local2->use_internal_node = TRUE; + ret = op_set_sp_reduce_internal(self, reshape_in_t, reshape_out_t, self->nn_param.reduce.type); + } + else if (self->nn_param.reduce.type == VSI_NN_REDUCE_SUM) + { + self->nn_param.reduce.local2->use_internal_node = TRUE; ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCESUM_INTERNAL); } else if (self->nn_param.reduce.type == VSI_NN_REDUCE_MAX) { + self->nn_param.reduce.local2->use_internal_node = TRUE; ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCEMAX_INTERNAL); } else if (self->nn_param.reduce.type == VSI_NN_REDUCE_MIN) { + self->nn_param.reduce.local2->use_internal_node = TRUE; ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCEMIN_INTERNAL); } else if (self->nn_param.reduce.type == VSI_NN_REDUCE_PROD) { + self->nn_param.reduce.local2->use_internal_node = TRUE; ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCEPROD_INTERNAL); } else if (self->nn_param.reduce.type == VSI_NN_REDUCE_ALL) { + self->nn_param.reduce.local2->use_internal_node = TRUE; ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCEALL_INTERNAL); } else if (self->nn_param.reduce.type == VSI_NN_REDUCE_ANY) { + self->nn_param.reduce.local2->use_internal_node = TRUE; ret = op_set_reduce_internal(self, reshape_in_t, reshape_out_t, VSI_NN_OP_REDUCEANY_INTERNAL); } - return ret; } /* op_setup() */ @@ -1097,6 +1228,8 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { + vsi_bool use_interanl_node = self->nn_param.reduce.local2->use_internal_node; + if (self->nn_param.reduce.local.axis_tensor != NULL) { vsi_nn_ReleaseTensor(&(self->nn_param.reduce.local.axis_tensor)); @@ -1132,12 +1265,7 @@ static vsi_status op_deinit self->nn_param.reduce.local2 = NULL; } - if (self->nn_param.reduce.type == VSI_NN_REDUCE_SUM || - self->nn_param.reduce.type == VSI_NN_REDUCE_MAX || - self->nn_param.reduce.type == VSI_NN_REDUCE_MIN || - self->nn_param.reduce.type == VSI_NN_REDUCE_ALL || - self->nn_param.reduce.type == VSI_NN_REDUCE_ANY || - self->nn_param.reduce.type == VSI_NN_REDUCE_PROD) + if ( use_interanl_node ) { vsi_nn_internal_deinit_node_wksp(self); } @@ -1184,4 +1312,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c new file mode 100644 index 0000000..ced3cd7 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c @@ -0,0 +1,163 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +typedef struct _reduce_mean_internal_local_data_t { + int32_t placeholder; +} reduce_mean_internal_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + int32_t * axis = self->nn_param.reduce_mean_internal.axis; + int32_t axis_num = self->nn_param.reduce_mean_internal.axis_num; + float scale = self->nn_param.reduce_mean_internal.scale; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + uint32_t axis_size = 0; + uint32_t rank_in = 0; + uint32_t rank_out = 0; + vsi_bool ret = FALSE; + vsi_nn_kernel_param_t * param = NULL; + + ret = vsi_nn_kernel_optimize_reduce_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + axis, axis_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], &rank_in, shapes[1], &rank_out, + new_axis, &axis_size); + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "axis_num", axis_size ); + vsi_nn_kernel_param_add_float32( param, "scale", scale ); + + if (ret) + { + uint32_t i = 0; + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shapes[0], rank_in ); + for (i = 0; i < axis_size; i++) + { + shapes[0][i] = 1; + } + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shapes[0], rank_in ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "reduce_mean", + &reshape_tensors[0], 1, + &reshape_tensors[1], 1, param ); + + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + } + + if ( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_reduce_mean_internal_param * p = &(self->nn_param.reduce_mean_internal); + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + uint32_t i = 0; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + + memcpy(outputs[0]->attr.size, inputs[0]->attr.size, inputs[0]->attr.dim_num * sizeof(vsi_size_t)); + + for (i = 0; i < p->axis_num; i++) + { + outputs[0]->attr.size[p->axis[i]] = 1; + } + } + + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ REDUCE_MEAN_INTERNAL, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c index c04009a..f9213ad 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c @@ -110,6 +110,7 @@ static vsi_bool op_check IO_TYPE(D_I32, D_I32, D_I32) IO_TYPE(D_I32, D_U32, D_U32) IO_TYPE(D_I32, D_F32, D_F32) + IO_TYPE(D_I32, D_BF16,D_BF16) END_IO_TYPE_DECL(SCATTER_ND) if(!VALIDATE_OP_IO_TYPES(SCATTER_ND, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c index 0f7ff2b..81a0afd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c @@ -81,7 +81,7 @@ static vsi_status _create_split_softmax return VSI_SUCCESS; } /* _create_split_softmax() */ -static vsi_status vsi_nn_softmax_compute +vsi_status op_compute ( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs, @@ -161,7 +161,7 @@ static vsi_status vsi_nn_softmax_compute } return status; -} /* vsi_nn_softmax_compute() */ +} /* op_compute() */ static vsi_status op_optimize ( @@ -296,7 +296,7 @@ DEF_OP_REG ( /* op_name */ SOFTMAX_INTERNAL, /* init */ NULL, - /* compute */ vsi_nn_softmax_compute, + /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, /* setup */ vsi_nn_op_common_setup, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_square.c b/src/tim/vx/internal/src/ops/vsi_nn_op_square.c index 86d46dd..8daa728 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_square.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_square.c @@ -49,6 +49,7 @@ static vsi_status op_compute { status = VSI_FAILURE; } + self->n = (vx_node)n; return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c index a514514..1dbe3ca 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c @@ -118,6 +118,16 @@ static vsi_bool op_setup } } + if (1 == node->input.num) + { + curr = vsi_nn_internal_new_node( node, VSI_NN_OP_RESHAPE2, 1, 1); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; + curr->node->nn_param.reshape2.size = outputs[0]->attr.size; + goto final; + } + input_shape[0] = block_size; input_shape[1] = block_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c 
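Editor's note on the stack shortcut above: stacking a single tensor only introduces a new axis of size 1 and leaves the element order untouched, so the whole operation degenerates to a RESHAPE2 onto the output shape. The shape transformation in isolation (stack_single_shape is illustrative; out must hold rank + 1 entries):

    #include <stddef.h>

    /* Stacking one [d0, ..., dk] tensor along `axis` yields the same data
     * viewed as [d0, ..., 1 inserted at axis, ..., dk]. */
    static void stack_single_shape(const size_t *in, size_t rank,
                                   size_t axis, size_t *out)
    {
        size_t i, j = 0;
        for (i = 0; i < rank + 1; ++i)
        {
            out[i] = (i == axis) ? 1 : in[j++];
        }
    }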
b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index c0a0562..76495df 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -467,6 +467,7 @@ static vsi_bool op_check IO_TYPE(D_F16, D_BF16) IO_TYPE(D_F16, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I32|Q_ASYM) END_IO_TYPE_DECL(STRIDED_SLICE) if (!VALIDATE_OP_IO_TYPES(STRIDED_SLICE, self, inputs, self->input.num, outputs, self->output.num)) diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index 9f8ca77..bae1005 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -54,7 +54,7 @@ static void _try_open_file VSILOGW( "File handle is not NULL." ); fclose( *fp ); } - *fp = fopen( file_path, mode ); + *fp = vsi_nn_fopen( file_path, mode ); if( NULL == *fp ) { VSILOGE( "Open file %s fail.", file_path ); @@ -437,6 +437,9 @@ static _op_param_gen_t s_op_gen[] = /* GRUCELL_ACTIVATION */ NULL, /* RESHAPE2 */ NULL, /* CONV3D */ NULL, + /* DECONV3D */ NULL, + /* PAD2 */ NULL, + /* COS */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); @@ -557,4 +560,3 @@ void vsi_nn_GenGraphCCode _try_close_file( &s_dfile_hndl ); _try_close_file( &s_net_file_hndl ); } /* vsi_nn_GenGraphCCode() */ - diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c index acca854..92dedcc 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -267,6 +267,12 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_asymm case U8: return vsi_nn_dtype_convert_float_to_quantize_asymm8( buffer, size, scale, zero_point, (uint8_t*)out_buffer ); + case I8: + return vsi_nn_dtype_convert_float_to_quantize_symm8( + buffer, size, scale, zero_point, (int8_t*)out_buffer ); + case I16: + return vsi_nn_dtype_convert_float_to_quantize_symm16( + buffer, size, scale, zero_point, (int16_t*)out_buffer ); default: VSILOGE("Don't support convert float to asymm quant %d.", dtype); break; @@ -413,6 +419,12 @@ vsi_bool vsi_nn_dtype_convert_quantize_asymm_to_float case U8: return vsi_nn_dtype_convert_quantize_asymm8_to_float( (const uint8_t *)buffer, size, scale, zero_point, out_buffer ); + case I8: + return vsi_nn_dtype_convert_quantize_symm8_to_float( + (const int8_t *)buffer, size, scale, zero_point, out_buffer ); + case I16: + return vsi_nn_dtype_convert_quantize_symm16_to_float( + (const int16_t *)buffer, size, scale, zero_point, out_buffer ); case I32: return vsi_nn_dtype_convert_quantize_symm32_to_float( (const int *)buffer, size, scale, zero_point, out_buffer ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index b05fdab..bd14b39 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -23,6 +23,7 @@ *****************************************************************************/ #include #include +#include #include #include #include @@ -103,6 +104,69 @@ _compiler_assert(VX_STATUS_MIN == -25, VX_STATUS_VALUE_CHANGED); static const int16_t vx_status_desc_cnt = _cnt_of_array( vx_status_desc ); +char* vsi_nn_strncpy + ( + char* dest, + const char* source, + size_t count + ) +{ + char* ret = NULL; + #ifdef _MSC_VER + strncpy_s(dest, count, source, _TRUNCATE); + #else + strncpy(dest, source, count); + #endif + return ret; +} + +char* 
vsi_nn_strncat + ( + char* dest, + const char* source, + size_t count + ) +{ + char* ret = NULL; + #ifdef _MSC_VER + strncat_s(dest, count, source, _TRUNCATE); + ret = dest; + #else + ret = strncat(dest, source, count); + #endif + return ret; +} + +char* vsi_nn_getenv + ( + const char * var_name + ) +{ + char* var = NULL; + #ifdef _MSC_VER + size_t var_size = 0; + _dupenv_s(&var, &var_size, var_name); + #else + var = getenv(var_name); + #endif + return var; +}; + +FILE* vsi_nn_fopen + ( + const char * file_name, + const char * mode + ) +{ + FILE * file = NULL; + #ifdef _MSC_VER + fopen_s(&file, file_name, mode); + #else + file = fopen(file_name, mode); + #endif + return file; +} + static vsi_size_t _compute_stride_rounding ( vsi_size_t out, @@ -148,7 +212,7 @@ uint8_t * vsi_nn_LoadBinaryData vsi_size_t cnt; FILE * fp; - fp = fopen( filename, "rb" ); + fp = vsi_nn_fopen( filename, "rb" ); if( NULL == fp ) { return NULL; @@ -867,21 +931,21 @@ void vsi_nn_FormatToString { switch(tensor->attr.dtype.vx_type) { - case VSI_NN_TYPE_INT4:strncpy(buf, "i4 ", buf_sz);break; - case VSI_NN_TYPE_INT8:strncpy(buf, "i8 ", buf_sz);break; - case VSI_NN_TYPE_INT16:strncpy(buf, "i16", buf_sz);break; - case VSI_NN_TYPE_INT32:strncpy(buf, "i32", buf_sz);break; - case VSI_NN_TYPE_INT64:strncpy(buf, "i64", buf_sz);break; - case VSI_NN_TYPE_UINT4:strncpy(buf, "u4 ", buf_sz);break; - case VSI_NN_TYPE_UINT8:strncpy(buf, "u8 ", buf_sz);break; - case VSI_NN_TYPE_UINT16:strncpy(buf, "u16", buf_sz);break; - case VSI_NN_TYPE_UINT32:strncpy(buf, "u32", buf_sz);break; - case VSI_NN_TYPE_UINT64:strncpy(buf, "u64", buf_sz);break; - case VSI_NN_TYPE_FLOAT16:strncpy(buf, "f16", buf_sz);break; - case VSI_NN_TYPE_FLOAT32:strncpy(buf, "f32", buf_sz);break; - case VSI_NN_TYPE_FLOAT64:strncpy(buf, "f64", buf_sz);break; - case VSI_NN_TYPE_BFLOAT16:strncpy(buf, "bf16", buf_sz);break; - case VSI_NN_TYPE_BOOL8:strncpy(buf, "bool8", buf_sz);break; + case VSI_NN_TYPE_INT4:vsi_nn_strncpy(buf, "i4 ", buf_sz);break; + case VSI_NN_TYPE_INT8:vsi_nn_strncpy(buf, "i8 ", buf_sz);break; + case VSI_NN_TYPE_INT16:vsi_nn_strncpy(buf, "i16", buf_sz);break; + case VSI_NN_TYPE_INT32:vsi_nn_strncpy(buf, "i32", buf_sz);break; + case VSI_NN_TYPE_INT64:vsi_nn_strncpy(buf, "i64", buf_sz);break; + case VSI_NN_TYPE_UINT4:vsi_nn_strncpy(buf, "u4 ", buf_sz);break; + case VSI_NN_TYPE_UINT8:vsi_nn_strncpy(buf, "u8 ", buf_sz);break; + case VSI_NN_TYPE_UINT16:vsi_nn_strncpy(buf, "u16", buf_sz);break; + case VSI_NN_TYPE_UINT32:vsi_nn_strncpy(buf, "u32", buf_sz);break; + case VSI_NN_TYPE_UINT64:vsi_nn_strncpy(buf, "u64", buf_sz);break; + case VSI_NN_TYPE_FLOAT16:vsi_nn_strncpy(buf, "f16", buf_sz);break; + case VSI_NN_TYPE_FLOAT32:vsi_nn_strncpy(buf, "f32", buf_sz);break; + case VSI_NN_TYPE_FLOAT64:vsi_nn_strncpy(buf, "f64", buf_sz);break; + case VSI_NN_TYPE_BFLOAT16:vsi_nn_strncpy(buf, "bf16", buf_sz);break; + case VSI_NN_TYPE_BOOL8:vsi_nn_strncpy(buf, "bool8", buf_sz);break; default: break; } @@ -1199,6 +1263,8 @@ int32_t vsi_nn_get_tensor_zero_point switch (tensor->attr.dtype.qnt_type) { case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: + zero_point = 0; + break; case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: zero_point = tensor->attr.dtype.zero_point; break; @@ -1226,7 +1292,14 @@ void vsi_nn_get_tensor_clamp_min_max } else if (vx_type == VSI_NN_TYPE_INT8) { - *clampMin = -128 - zero_point; + if (input->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC) + { + *clampMin = -127 - zero_point; + } + else + { + *clampMin = -128 - zero_point; + } *clampMax = 127 - zero_point; } 
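Editor's note on the two quantization tweaks above: a symmetric affine tensor now always reports a zero point of 0, and its int8 clamp floor moves from -128 to -127 so the representable magnitudes stay symmetric around zero, with the -128 code left unused as in most symmetric int8 schemes. A compact restatement of those ranges (illustrative types, not the ovxlib enums):

    #include <stdint.h>

    typedef enum { QNT_AFFINE_ASYMMETRIC, QNT_AFFINE_SYMMETRIC } qnt_kind_t;

    /* Clamp limits for an int8 quantized tensor, expressed as code-point
     * offsets relative to its zero point, as in the hunk above. */
    static void int8_clamp_range(qnt_kind_t kind, int32_t zero_point,
                                 float *lo, float *hi)
    {
        if (kind == QNT_AFFINE_SYMMETRIC)
        {
            zero_point = 0;                  /* symmetric tensors pin the zero point */
            *lo = -127.0f - (float)zero_point;  /* -128 excluded to keep range symmetric */
        }
        else
        {
            *lo = -128.0f - (float)zero_point;
        }
        *hi = 127.0f - (float)zero_point;
    }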
else if (vx_type == VSI_NN_TYPE_INT16) diff --git a/src/tim/vx/internal/src/vsi_nn_client_op.c b/src/tim/vx/internal/src/vsi_nn_client_op.c index fcfa365..b7983d9 100644 --- a/src/tim/vx/internal/src/vsi_nn_client_op.c +++ b/src/tim/vx/internal/src/vsi_nn_client_op.c @@ -34,6 +34,7 @@ typedef struct _client_node { vsi_nn_op_t op; vsi_nn_op_proc_t proc; + const char* kernel_name; } _client_node_t; static vsi_nn_binary_tree_t * s_root = NULL; @@ -139,3 +140,41 @@ void vsi_nn_OpRemoveClient } } /* vsi_nn_OpRemoveClient() */ +vsi_bool vsi_nn_OpAddClientName + ( + vsi_nn_op_t op, + const char* kernel_name + ) +{ + _client_node_t * node; + vsi_bool ret; + + ret = FALSE; + node = (_client_node_t *)vsi_nn_BinaryTreeGetNode( + &s_root, + (vsi_nn_binary_tree_key_t)op ); + if( NULL != node && NULL != kernel_name) + { + node->kernel_name = kernel_name; + ret = TRUE; + } + return ret; +}/* vsi_nn_OpAddClientName() */ + +const char * vsi_nn_OpGetClientName + ( + vsi_nn_op_t op + ) +{ + _client_node_t * node; + + node = (_client_node_t *)vsi_nn_BinaryTreeGetNode( + &s_root, + (vsi_nn_binary_tree_key_t)op ); + + if( NULL != node ){ + return node->kernel_name; + }else{ + return NULL; + } +} /* vsi_nn_OpGetClientName() */ \ No newline at end of file diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index f453b32..acd5b4f 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -86,7 +86,7 @@ final: int32_t vsi_nn_getEnv(const char* name, char** env_s) { int32_t ret = 0; - *env_s = getenv(name); + *env_s = vsi_nn_getenv(name); if (*env_s) { ret = TRUE; } @@ -121,6 +121,13 @@ static vsi_status vsi_nn_initOptions options->enable_concat_optimize = atoi(env_s); } + env_s = NULL; + options->enable_asymi8_to_u8 = 1; + if (vsi_nn_getEnv("VSI_NN_ENABLE_I8TOU8", &env_s) && env_s) + { + options->enable_asymi8_to_u8 = atoi(env_s); + } + return VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index b721265..76f4f2c 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -614,8 +614,8 @@ void vsi_nn_ReleaseGraph uint32_t i; vsi_nn_graph_t * ptr; - ptr = *graph; - if( NULL != graph && NULL != * graph ) + ptr = (NULL != graph) ? 
*graph : NULL; + if( NULL != ptr) { if( NULL != ptr->nodes ) { @@ -657,7 +657,6 @@ void vsi_nn_ReleaseGraph free( ptr ); *graph = NULL; } - } /* vsi_nn_ReleaseGraph() */ /* @@ -1171,6 +1170,65 @@ vsi_nn_node_t * vsi_nn_AppendNode return vsi_nn_AddNode( graph, op, 0, 0, node_id ); } /* vsi_nn_AppendNode() */ +vsi_nn_node_t * vsi_nn_AddExternalNode + ( + vsi_nn_graph_t * graph, + vsi_nn_op_t op, + const void * proc, + vsi_nn_node_id_t * node_id, + const char * kernel_name + ) +{ + vsi_nn_node_t * node; + vsi_nn_node_id_t id; + vsi_nn_op_proc_t * node_proc; + + node_proc = (vsi_nn_op_proc_t*)proc; + + if( NULL == graph ) + { + return NULL; + } + node = (vsi_nn_node_t *)malloc( sizeof( vsi_nn_node_t ) ); + + if( NULL != node ) + { + memset( node, 0, sizeof( vsi_nn_node_t ) ); + node->graph = graph; + node->op = op; + node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO; + node->vx_param.down_scale_size_rounding = VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR; + + /* init op */ + if(node_proc->init != NULL){ + //TODO + } + + /* init output struct */ + node->output.num = node_proc->output_num; + node->output.tensors = (vsi_nn_tensor_id_t *) malloc( + node_proc->output_num * sizeof( vsi_nn_tensor_id_t ) ); + vsi_nn_InitTensorsId( node->output.tensors, node_proc->output_num ); + + /* init input struct */ + node->input.num = node_proc->input_num; + node->input.tensors = (vsi_nn_tensor_id_t *) malloc( + node_proc->input_num * sizeof( vsi_nn_tensor_id_t ) ); + vsi_nn_InitTensorsId( node->input.tensors, node_proc->input_num ); + node->attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE; + node->attr.enable_op_constraint_check = TRUE; + } + id = graph->cur_nid; + if(NULL != node){ + vsi_nn_MapAdd( graph->node_table, (vsi_nn_map_key_t)id, (void *)node ); + graph->node_num = graph->cur_nid; + graph->cur_nid ++; + } + vsi_nn_OpRegisterExternalOvxInit(op, kernel_name, node_proc); + return node; +} /* vsi_nn_AddExternalNode() */ + void vsi_nn_RemoveNode ( vsi_nn_graph_t * graph, @@ -1251,7 +1309,6 @@ vsi_bool vsi_nn_SetGraphOutputs } return ret; - } /* vsi_nn_SetGraphOutputs() */ vsi_nn_node_id_t * vsi_nn_SortGraphNode @@ -1507,10 +1564,10 @@ void vsi_nn_DumpGraphNodeOutputsEx if( NULL != prefix ) { - strncpy(filename_prefix, prefix, _SHAPE_BUF_SZ); + vsi_nn_strncpy(filename_prefix, prefix, _SHAPE_BUF_SZ); filename_prefix[_SHAPE_BUF_SZ - 1] = '\0'; - strncat(filename_prefix, "_", _SHAPE_BUF_SZ - 1); + vsi_nn_strncat(filename_prefix, "_", _SHAPE_BUF_SZ - 1); filename_prefix[_SHAPE_BUF_SZ - 1] = '\0'; } @@ -1611,7 +1668,7 @@ void vsi_nn_DumpGraphToJson return ; } - fp = fopen("graph.json", "w+"); + fp = vsi_nn_fopen("graph.json", "w+"); if(NULL == fp) { VSILOGE("Create dump file fail"); diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index fb17d8b..8e57205 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -205,7 +205,6 @@ static void _get_graph_input_asymm_int8_norm_tensor { tensor_ids[id_count ++] = id; } - } tensor_count += 1; } @@ -867,7 +866,7 @@ vsi_status vsi_nn_OptimizeGraph } } - if (!nbg_flag) + if (!nbg_flag && graph->ctx->options.enable_asymi8_to_u8) { status = _graph_optimization_convert_int8_to_uint8(graph, dirty); TEST_CHECK_STATUS(status, final); @@ -876,4 +875,3 @@ vsi_status vsi_nn_OptimizeGraph final: return status; } /* vsi_nn_OptimizeGraph() */ - diff --git 
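Editor's note on the new VSI_NN_ENABLE_I8TOU8 option above: it gates the existing asymmetric-int8-to-uint8 graph rewrite. The rewrite itself is not shown in this hunk, but the value mapping underneath is the standard one for affine quantization: shifting both the stored codes and the zero point by 128 leaves every represented real value unchanged, since real = scale * (q - zp). A sketch of that mapping, for illustration only:

    #include <stdint.h>
    #include <stddef.h>

    /* Re-express an asymmetric int8 tensor as uint8: moving the codes and the
     * zero point up by 128 preserves scale * (q - zp) for every element. */
    static void asym_i8_to_u8(const int8_t *src, uint8_t *dst, size_t n,
                              int32_t *zero_point /* in/out */)
    {
        size_t i;
        for (i = 0; i < n; ++i)
        {
            dst[i] = (uint8_t)(src[i] + 128);
        }
        *zero_point += 128;
    }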
a/src/tim/vx/internal/src/vsi_nn_log.c b/src/tim/vx/internal/src/vsi_nn_log.c index 9ee1114..25d421b 100644 --- a/src/tim/vx/internal/src/vsi_nn_log.c +++ b/src/tim/vx/internal/src/vsi_nn_log.c @@ -40,7 +40,6 @@ static const char* ENV_LOG_LEVEL = "VSI_NN_LOG_LEVEL"; #endif int get_env_as_int(const char* env, int default_value) { - int value = default_value; #ifdef __ANDROID__ { @@ -52,7 +51,7 @@ int get_env_as_int(const char* env, int default_value) { } #else { - char* env_s = getenv(env); + char* env_s = vsi_nn_getenv(env); if (env_s) { value = atoi(env_s); } diff --git a/src/tim/vx/internal/src/vsi_nn_node.c b/src/tim/vx/internal/src/vsi_nn_node.c index 0d5fbc8..86d4937 100644 --- a/src/tim/vx/internal/src/vsi_nn_node.c +++ b/src/tim/vx/internal/src/vsi_nn_node.c @@ -106,8 +106,8 @@ void vsi_nn_ReleaseNode ) { vsi_nn_node_t * ptr; - ptr = *node; - if( NULL != node && NULL != *node ) + ptr = (NULL != node) ? *node : NULL; + if( NULL != ptr) { vsi_nn_OpDeinit( ptr->op, ptr ); if( NULL != ptr->input.tensors ) diff --git a/src/tim/vx/internal/src/vsi_nn_ops.c b/src/tim/vx/internal/src/vsi_nn_ops.c index 0214680..8ca7df2 100644 --- a/src/tim/vx/internal/src/vsi_nn_ops.c +++ b/src/tim/vx/internal/src/vsi_nn_ops.c @@ -87,6 +87,15 @@ static const char * vsi_nn_internal_ops_name[] = }; #undef DEF_OP + +vsi_bool _is_external_ops(vsi_nn_op_t op) { + vsi_bool ret = FALSE; + if (op < 0) { + ret = TRUE; + } + return ret; +} + vsi_bool _is_custom_ops ( vsi_nn_op_t op @@ -357,13 +366,33 @@ vsi_bool vsi_nn_OpRegisterOvxInit return ret; } /* vsi_nn_OpRegisterClientCompute() */ +vsi_bool vsi_nn_OpRegisterExternalOvxInit + ( + vsi_nn_op_t op, + const char* kernel_name, + vsi_nn_op_proc_t* proc + ) +{ + vsi_bool ret; + + ret = FALSE; + if (vsi_nn_OpRegisterClient(op, proc) && + vsi_nn_OpAddClientName(op, kernel_name)) { + ret = TRUE; + } + return ret; +} + const char * vsi_nn_OpGetName ( vsi_nn_op_t op ) { const char * name; - if( op < VSI_NN_OP_NUM ) + if(_is_external_ops(op)){ + name = vsi_nn_OpGetClientName(op); + } + else if( op < VSI_NN_OP_NUM ) { name = vsi_nn_ops_name[op]; } diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index 06dd052..8c6c7ba 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -118,22 +118,37 @@ static void _set_preproc_node_rect_params ( vsi_nn_node_t* node, vsi_nn_preprocess_crop_t* crop, - vsi_nn_preprocess_image_size_t* input_size + vsi_nn_preprocess_image_size_t* input_size, + vsi_nn_preprocess_source_format_e* source_format ) { if(crop != NULL) { - node->nn_param.pre_process.rect.left = crop->begin[0]; - node->nn_param.pre_process.rect.top = crop->begin[1]; - node->nn_param.pre_process.rect.width = crop->size[0]; - node->nn_param.pre_process.rect.height = crop->size[1]; + if(*source_format == VSI_NN_SOURCE_FORMAT_TENSOR) + { + VSILOGW("don not need to set crop parameter for tensor preprocess"); + } + else + { + node->nn_param.pre_process.rect.left = crop->begin[0]; + node->nn_param.pre_process.rect.top = crop->begin[1]; + node->nn_param.pre_process.rect.width = crop->size[0]; + node->nn_param.pre_process.rect.height = crop->size[1]; + } } - else + else if (*source_format != VSI_NN_SOURCE_FORMAT_TENSOR) { - node->nn_param.pre_process.rect.left = 0; - node->nn_param.pre_process.rect.top = 0; - node->nn_param.pre_process.rect.width = input_size->w; - node->nn_param.pre_process.rect.height = input_size->h; + if(input_size == NULL) + { + 
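Editor's note on the vsi_nn_ReleaseNode change above (and the matching vsi_nn_ReleaseGraph and vsi_nn_ReleaseTensor edits in this patch): the old code read *handle before checking the handle pointer for NULL, so passing a NULL handle was undefined behaviour even though the very next line tried to guard against it. The corrected release pattern, shown on a generic handle type:

    #include <stdlib.h>

    typedef struct thing { int payload; } thing_t;

    /* Safe double-pointer release: only dereference after the handle itself
     * is known to be non-NULL, then clear the caller's pointer. */
    static void thing_release(thing_t **handle)
    {
        thing_t *ptr = (handle != NULL) ? *handle : NULL;
        if (ptr != NULL)
        {
            free(ptr);
            *handle = NULL;
        }
    }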
VSILOGE("Please set image size for preprocess node"); + } + else + { + node->nn_param.pre_process.rect.left = 0; + node->nn_param.pre_process.rect.top = 0; + node->nn_param.pre_process.rect.width = input_size->w; + node->nn_param.pre_process.rect.height = input_size->h; + } } } /* _set_preproc_node_rect_params() */ @@ -490,7 +505,7 @@ vsi_status vsi_nn_add_single_preproc_node status = _set_preproc_node_type(node, source_format); TEST_CHECK_STATUS(status, final); - _set_preproc_node_rect_params(node, crop, input_size); + _set_preproc_node_rect_params(node, crop, input_size, source_format); _set_preproc_node_norm_params(node, mean_and_scale, &org_norm_tensor->attr); if(permute != NULL) diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index e82e537..80ea7a6 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -139,7 +139,7 @@ static void print_tensor break; #endif default: - strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ); + vsi_nn_strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ); break; } @@ -475,18 +475,18 @@ static vsi_bool _init_tensor { #ifdef VSI_40BIT_VA_SUPPORT { - vx_size size_vxsize[_cnt_of_array(tensor->attr.size)] = {0}; + vx_size size_vxsize2[_cnt_of_array(tensor->attr.size)] = {0}; vx_size stride_size_vxsize[_cnt_of_array(stride_size)] = {0}; for(i = 0; i < _cnt_of_array(tensor->attr.size); i++) { - size_vxsize[i] = -1 == tensor->attr.size[i] ? -1 : (vx_size)tensor->attr.size[i]; + size_vxsize2[i] = -1 == tensor->attr.size[i] ? -1 : (vx_size)tensor->attr.size[i]; } for(i = 0; i < _cnt_of_array(stride_size); i++) { stride_size_vxsize[i] = -1 == stride_size[i] ? -1 : (vx_size)stride_size[i]; } addr = vxCreateTensorAddressing(graph->ctx->c, - size_vxsize, stride_size_vxsize, (vx_size)tensor->attr.dim_num); + size_vxsize2, stride_size_vxsize, (vx_size)tensor->attr.dim_num); } #else { @@ -785,8 +785,8 @@ void vsi_nn_ReleaseTensor ) { vsi_nn_tensor_t * ptr; - ptr = *tensor; - if( NULL != tensor && NULL != *tensor ) + ptr = (NULL != tensor) ? *tensor : NULL; + if( NULL != ptr) { uint8_t * handle = NULL; if( NULL != ptr->t ) @@ -1224,7 +1224,7 @@ void vsi_nn_SaveTensorToTextByFp32 return; } - fp = fopen( filename, "w" ); + fp = vsi_nn_fopen( filename, "w" ); if( NULL == fp ) { VSILOGW( "Write file %s fail. Please check...", filename ); @@ -1313,7 +1313,7 @@ void vsi_nn_SaveDataToText return; } - fp = fopen( filename, "w" ); + fp = vsi_nn_fopen( filename, "w" ); if( NULL == fp ) { VSILOGW( "Write file %s fail. Please check...", filename ); @@ -1358,6 +1358,8 @@ void vsi_nn_SaveTensorToBinary FILE * fp; vsi_size_t sz; uint32_t i; + uint8_t * packed_data = NULL; + vsi_size_t packed_size; if( NULL == graph || NULL == tensor || NULL == filename ) { @@ -1365,24 +1367,42 @@ void vsi_nn_SaveTensorToBinary } data = vsi_nn_ConvertTensorToData( graph, tensor ); + if( NULL == data ) { VSILOGE( "Convert data fail." ); return; } - fp = fopen( filename, "wb" ); + fp = vsi_nn_fopen( filename, "wb" ); if( NULL == fp ) { VSILOGW( "Write file %s fail. 
Please check...", filename ); return; } sz = (vsi_size_t)vsi_nn_GetTypeBytes( tensor->attr.dtype.vx_type ); - for( i = 0; i < tensor->attr.dim_num; i ++ ) + if( tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT4 || + tensor->attr.dtype.vx_type == VSI_NN_TYPE_UINT4 ) { - sz *= tensor->attr.size[i]; + packed_size = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, + tensor->attr.dtype.vx_type); + packed_data = (uint8_t*)malloc(packed_size); + vsi_nn_Pack4bitData(tensor, data, packed_data); + fwrite( packed_data, packed_size, 1, fp ); + if( packed_data ) + { + free(packed_data); + packed_data = NULL; + } + } + else + { + for( i = 0; i < tensor->attr.dim_num; i ++ ) + { + sz *= tensor->attr.size[i]; + } + fwrite( data, sz, 1, fp ); } - fwrite( data, sz, 1, fp ); fclose( fp ); vsi_nn_safe_free( data ); } /* vsi_nn_SaveTensorToBinary() */ @@ -2720,4 +2740,4 @@ final: vsi_nn_safe_free(data); return output; -} \ No newline at end of file +}