From 1bb1e070f2136b4290d2680306e02f6521c5a469 Mon Sep 17 00:00:00 2001
From: Chen Feiyue <69809761+chenfeiyue-cfy@users.noreply.github.com>
Date: Fri, 3 Nov 2023 13:16:33 +0800
Subject: [PATCH] Update internal to 1.1.88 release (#657)

Internal ovxlib SHA 32fe479af5549e894bcd40de5740ae0dfd42bdb9

Type: Code Improvement

Signed-off-by: Feiyue Chen
---
 src/tim/vx/internal/include/interface/ops.def | 1 +
 .../include/ops/vsi_nn_op_deconvolution.h | 2 +-
 .../include/ops/vsi_nn_op_max_pool3d.h | 1 +
 .../internal/include/ops/vsi_nn_op_moments.h | 9 +
 .../vx/internal/include/ops/vsi_nn_op_pool.h | 1 +
 .../internal/include/ops/vsi_nn_op_reducel2.h | 47 +
 .../include/utils/vsi_nn_dtype_util_prv.h | 245 +-
 src/tim/vx/internal/include/vsi_nn_graph.h | 14 +-
 .../include/vsi_nn_graph_optimization.h | 2 +-
 .../vx/internal/include/vsi_nn_node_type.h | 2 +
 src/tim/vx/internal/include/vsi_nn_ops.h | 2 +-
 .../include/vsi_nn_pre_post_process.h | 8 +
 .../vx/internal/include/vsi_nn_tensor_util.h | 7 +
 src/tim/vx/internal/include/vsi_nn_version.h | 2 +-
 .../ops/kernel/cpu/custom_warp_affine_cpu.c | 64 +-
 .../ops/kernel/evis/custom_warp_affine_evis.c | 150 +-
 .../custom/ops/vsi_nn_op_custom_warp_affine.c | 4 +-
 src/tim/vx/internal/src/kernel/cl/clip_cl.c | 2 +-
 src/tim/vx/internal/src/kernel/cl/cumsum_cl.c | 20 +-
 src/tim/vx/internal/src/kernel/cl/gather_cl.c | 33 +-
 .../internal/src/kernel/cl/log_softmax_cl.c | 17 +-
 .../src/kernel/cl/lstmunit_activation_cl.c | 5 +-
 .../vx/internal/src/kernel/cl/matrixmul_cl.c | 248 +-
 .../vx/internal/src/kernel/cl/moments_cl.c | 225 +-
 src/tim/vx/internal/src/kernel/cl/pool_cl.c | 318 ++
 .../vx/internal/src/kernel/evis/clip_evis.c | 2 +-
 .../src/kernel/evis/comparisons_evis.c | 2 +-
 .../vx/internal/src/kernel/evis/cumsum_evis.c | 20 +-
 .../vx/internal/src/kernel/evis/gather_evis.c | 35 +-
 .../src/kernel/evis/grucell_activation_evis.c | 77 +-
 .../kernel/evis/grucell_activation_sma_evis.c | 45 +-
 .../kernel/evis/grucell_activation_z_h_evis.c | 46 +-
 .../evis/grucell_h_times_activation_r_evis.c | 46 +-
 .../grucell_reset_after_activation_evis.c | 46 +-
 .../kernel/evis/lstmunit_activation_evis.c | 86 +-
 .../internal/src/kernel/evis/matrixmul_evis.c | 77 +-
 .../vx/internal/src/kernel/evis/pool_evis.c | 374 ++
 .../src/kernel/evis/pre_process_nv12_evis.c | 148 +-
 .../src/kernel/evis/pre_process_yuv422_evis.c | 27 +
 .../kernel/evis/resize_bilinear_nhwc_evis.c | 10 +-
 .../src/kernel/evis/scatter_nd_update_evis.c | 60 +-
 .../kernel/vsi_nn_kernel_gpu_shape_optimize.c | 22 +
 .../src/kernel/vsi_nn_kernel_selector.c | 12 +
 src/tim/vx/internal/src/kernel/vx/gather_vx.c | 82 +
 src/tim/vx/internal/src/kernel/vx/pow_vx.c | 73 +
 .../internal/src/kernel/vx/relationalops_vx.c | 83 +
 src/tim/vx/internal/src/kernel/vx/tile_vx.c | 78 +
 .../vx/internal/src/libnnext/ops/cl/cumsum.cl | 8 +-
 .../internal/src/libnnext/ops/cl/cumsum_2d.cl | 21 +-
 .../src/libnnext/ops/cl/log_softmax_axis0.cl | 152 +-
 .../src/libnnext/ops/cl/log_softmax_axis1.cl | 154 +-
 .../src/libnnext/ops/cl/log_softmax_axis2.cl | 67 +
 .../src/libnnext/ops/cl/matrixmul_4x.cl | 127 +
 .../internal/src/libnnext/ops/cl/maxpool.cl | 217 +
 .../src/libnnext/ops/cl/moments_axis01.cl | 63 +
 .../src/libnnext/ops/vx/custom_warp_affine.vx | 151 -
 .../libnnext/ops/vx/custom_warp_affine_2d.vx | 158 +
 .../ops/vx/custom_warp_affine_optional.vx | 341 ++
 .../ops/vx/custom_warp_affine_rgb_optional.vx | 333 ++
 .../src/libnnext/ops/vx/gather_array.vx | 34 +-
 .../src/libnnext/ops/vx/grucell_activation.vx | 53 +
 .../libnnext/ops/vx/grucell_activation_sma.vx | 103 +
 .../libnnext/ops/vx/grucell_activation_z_h.vx | 52 +
 .../ops/vx/grucell_cdnn_activation_bf16.vx | 344 ++
 .../ops/vx/grucell_h_times_activation_r.vx | 39 +
 .../ops/vx/grucell_reset_after_activation.vx | 65 +
 .../ops/vx/lstmunit_activation_BP_BF16.vx | 124 +
 .../ops/vx/lstmunit_activation_B_BF16.vx | 126 +
 .../ops/vx/lstmunit_activation_CBP_BF16.vx | 111 +
 .../ops/vx/lstmunit_activation_CB_BF16.vx | 113 +
 .../ops/vx/lstmunit_activation_CLP_BF16.vx | 101 +
 .../ops/vx/lstmunit_activation_CL_BF16.vx | 102 +
 .../ops/vx/lstmunit_activation_CSP_BF16.vx | 104 +
 .../ops/vx/lstmunit_activation_CS_BF16.vx | 106 +
 .../ops/vx/lstmunit_activation_LP_BF16.vx | 110 +
 .../ops/vx/lstmunit_activation_L_BF16.vx | 112 +
 .../ops/vx/lstmunit_activation_SP_BF16.vx | 117 +
 .../ops/vx/lstmunit_activation_S_BF16.vx | 118 +
 .../internal/src/libnnext/ops/vx/maxpool.vx | 283 ++
 .../libnnext/ops/vx/pre_process_nv12_copy.vx | 20 +-
 .../libnnext/ops/vx/pre_process_nv12_scale.vx | 38 +-
 .../ops/vx/pre_process_rgb888_planar_sep_1.vx | 2 +-
 .../ops/vx/pre_process_yuv422_copy.vx | 17 +-
 .../ops/vx/pre_process_yuv422_scale.vx | 23 +-
 .../src/libnnext/ops/vx/relational_ops_2d.vx | 4 +-
 .../src/libnnext/ops/vx/relational_ops_3d.vx | 4 +-
 .../libnnext/ops/vx/scatter_nd_update_qint.vx | 58 +-
 .../src/libnnext/vsi_nn_libnnext_resource.c | 4339 +++++++++++++++--
 src/tim/vx/internal/src/makefile.linux | 261 -
 .../vx/internal/src/ops/vsi_nn_op_argmaxmin.c | 1 +
 src/tim/vx/internal/src/ops/vsi_nn_op_cast.c | 2 +-
 .../vx/internal/src/ops/vsi_nn_op_concat.c | 3 +
 .../vx/internal/src/ops/vsi_nn_op_conv2d.c | 10 +-
 .../internal/src/ops/vsi_nn_op_dataconvert.c | 2 +
 .../src/ops/vsi_nn_op_deconvolution.c | 24 +-
 .../vx/internal/src/ops/vsi_nn_op_gather.c | 24 -
 .../vx/internal/src/ops/vsi_nn_op_grucell.c | 4 +
 .../src/ops/vsi_nn_op_lstmunit_ovxlib.c | 11 +
 .../vx/internal/src/ops/vsi_nn_op_matrixmul.c | 45 +-
 .../internal/src/ops/vsi_nn_op_max_pool3d.c | 28 +-
 .../vx/internal/src/ops/vsi_nn_op_moments.c | 235 +-
 src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c | 85 +-
 src/tim/vx/internal/src/ops/vsi_nn_op_pool.c | 57 +-
 .../internal/src/ops/vsi_nn_op_pre_process.c | 190 +-
 .../vx/internal/src/ops/vsi_nn_op_reduce.c | 11 +-
 .../vx/internal/src/ops/vsi_nn_op_reducel2.c | 183 +
 .../src/ops/vsi_nn_op_relational_ops.c | 4 +
 src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c | 10 +
 .../src/ops/vsi_nn_op_strided_slice.c | 29 +-
 src/tim/vx/internal/src/ops/vsi_nn_op_tile.c | 99 +-
 .../internal/src/utils/vsi_nn_binary_tree.c | 2 +-
 .../src/utils/vsi_nn_code_generator.c | 1 +
 .../vx/internal/src/vip/virtual_device.cpp | 2 +-
 .../internal/src/vip/virtual_device_private.h | 2 +-
 src/tim/vx/internal/src/vsi_nn_graph.c | 114 +-
 src/tim/vx/internal/src/vsi_nn_node.c | 104 +-
 .../internal/src/vsi_nn_node_attr_template.c | 1 +
 .../vx/internal/src/vsi_nn_pre_post_process.c | 148 +-
 src/tim/vx/internal/src/vsi_nn_tensor.c | 11 +
 src/tim/vx/internal/src/vsi_nn_types_prv.h | 28 +
 120 files changed, 11472 insertions(+), 1753 deletions(-)
 create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_reducel2.h
 create mode 100644 src/tim/vx/internal/src/kernel/cl/pool_cl.c
 create mode 100644 src/tim/vx/internal/src/kernel/evis/pool_evis.c
 create mode 100644 src/tim/vx/internal/src/kernel/vx/gather_vx.c
 create mode 100644 src/tim/vx/internal/src/kernel/vx/pow_vx.c
 create mode 100644 src/tim/vx/internal/src/kernel/vx/relationalops_vx.c
 create mode 100644 src/tim/vx/internal/src/kernel/vx/tile_vx.c
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_4x.cl
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/maxpool.cl
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_2d.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_optional.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_rgb_optional.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/grucell_cdnn_activation_bf16.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_BP_BF16.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_B_BF16.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CBP_BF16.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CB_BF16.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CLP_BF16.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CL_BF16.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CSP_BF16.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CS_BF16.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_LP_BF16.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_L_BF16.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_SP_BF16.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_S_BF16.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/maxpool.vx
 delete mode 100644 src/tim/vx/internal/src/makefile.linux
 create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_reducel2.c

diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def
index 0753df0..0a1424e 100755
--- a/src/tim/vx/internal/include/interface/ops.def
+++ b/src/tim/vx/internal/include/interface/ops.def
@@ -194,3 +194,4 @@ DEF_OP(INVERSE_SIGMOID)
 DEF_OP(GRID_SAMPLE)
 DEF_OP(LPNORM)
 DEF_OP(RESIZE_3D)
+DEF_OP(REDUCEL2)
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_deconvolution.h b/src/tim/vx/internal/include/ops/vsi_nn_op_deconvolution.h
index f8bc670..d1a004f 100644
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_deconvolution.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_deconvolution.h
@@ -35,7 +35,7 @@ typedef struct _vsi_nn_deconv_param
     uint32_t ksize[2];
     uint32_t stride[2];
     /* Pad left, right, top, bottom */
-    uint32_t pad[4];
+    int32_t pad[4];
     /* Pad type default value shall be AUTO */
     uint32_t pad_type;
     uint32_t weights;
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_max_pool3d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_max_pool3d.h
index 043d9e0..50fb246 100644
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_max_pool3d.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_max_pool3d.h
@@ -44,6 +44,7 @@ typedef struct _vsi_nn_max_pool3d_param
     uint32_t pad[6];
     /* Pad type default value shall be AUTO */
     vsi_nn_pad_e pad_type;
+    uint32_t dilation[3];
 } vsi_nn_max_pool3d_param;
 _compiler_assert(offsetof(vsi_nn_max_pool3d_param, local) == 0, \
     vsi_nn_max_pool3d_h );
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_moments.h b/src/tim/vx/internal/include/ops/vsi_nn_op_moments.h
index fd6427a..4932f15 100644
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_moments.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_moments.h
@@ -30,11 +30,20 @@
 extern "C" {
 #endif
 
+typedef struct _vsi_nn_moments_lcl_data
+{
+    vsi_bool use_internal_node;
+    uint32_t
perm[VSI_NN_MAX_DIM_NUM]; + int32_t axis[VSI_NN_MAX_DIM_NUM]; +} vsi_nn_moments_lcl_data; + typedef struct _vsi_nn_moments_param { const int32_t* axis; int32_t axis_num; vsi_bool keep_dim; + + vsi_nn_moments_lcl_data *lcl_data; } vsi_nn_moments_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pool.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pool.h index ee32df3..fed17fd 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pool.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pool.h @@ -50,6 +50,7 @@ typedef struct _vsi_nn_pool_param vsi_nn_pad_e pad_type; /* poolwithargmax layer local data structure */ vsi_nn_pool_lcl_data *local; + uint32_t dilation[2]; } vsi_nn_pool_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reducel2.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reducel2.h new file mode 100644 index 0000000..473bbe2 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reducel2.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_REDUCEL2_H +#define _VSI_NN_OP_REDUCEL2_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_reducel2_param +{ + struct _reducel2_local_data_t * lcl; + vx_int32 *axis; + vx_uint32 axis_num; + vx_bool keep_dim; +} vsi_nn_reducel2_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h index 367ff88..9d32dfb 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -466,77 +466,148 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) { float fp8_f32 = in / scale; - int32_t fp8_i32 = *((int32_t*)&fp8_f32); - //int32_t mask = (int32_t)(pow(2, 32) - 1 - (pow(2, 23 - 3) - 1)); - int32_t eps = 1 << (23 - 3 - 1); - fp8_i32 += eps; - //fp8_i32 &= mask; - { - int sign = (fp8_i32 >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; - int exp = (fp8_i32 >> FLOAT_MANTISSA_SIZE) & 0xff; - int expShiftValue = FLOAT8_E4M3_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT; - int mantissa = (fp8_i32 >> (FLOAT_MANTISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7; + int32_t in_val = *((int32_t*)&fp8_f32); - exp = (exp + expShiftValue) & 0xF; + uint32_t in_sign = (in_val >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; /* bit 31 is sign */ + uint32_t in_exp = (in_val >> FLOAT_MANTISSA_SIZE) & 0xFF; /* bit[30: 24] is exp */ + uint32_t in_man = (in_val & 0x7FFFFF); /* low 23 bits is man */ - return (uint8_t)(sign << 7 | exp << 3 | mantissa); + uint32_t out_sign = in_sign; + int32_t out_exp = (in_exp + FLOAT8_E4M3_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT); /* in_exp - fp32bias + SE4M3 bias */ + uint32_t man_rounding = 0, out_man = 0, out_val = 0; + + man_rounding = (in_man + 0x80000) >> 20; /* manrounding is 3 bits */ + if (((man_rounding >> 3) && 0x1) == 1) { + /* when in_man like 0b11_1, exp += 1, mantissa is 0*/ + out_exp += 1; } + + /* Clamp Denorm to zero */ + if (out_exp <= 0) { + out_exp = 0; + man_rounding = 0; + out_sign = 0; + } + + out_man = man_rounding & 0x7; /* keep low 3 bits of man */ + /* overflow policy */ + if (out_exp >= 16 || (out_exp == 15 && out_man == 7)) { + out_exp = 15; + out_man = 6; +#if 0 + if (mode == VX_CONVERT_POLICY_SATURATE) { + out_exp = 15; + out_man = 6; + } else if (mode == VX_CONVERT_POLICY_INF) { + out_exp = 15; + out_man = 7; + } else { + vxmASSERT(0 && "Error overflow mode!\n"); + } +#endif + } + out_val = (out_sign << 7) | (out_exp << 3) | out_man; + return (uint8_t)(out_val & 0xFF); } /* fp32_to_fp8_e4m3() */ static VSI_INLINE_API uint8_t fp32_to_fp8_e5m2(float in, const float scale) { float fp8_f32 = in / scale; - int32_t fp8_i32 = *((int32_t*)&fp8_f32); - //int32_t mask = (int32_t)(pow(2, 32) - 1 - (pow(2, 23 - 2) - 1)); - int32_t eps = 1 << (23 - 2 - 1); - fp8_i32 += eps; - //fp8_i32 &= mask; - { - int sign = (fp8_i32 >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; - int exp = (fp8_i32 >> FLOAT_MANTISSA_SIZE) & 0xff; - int expShiftValue = FLOAT8_E5M2_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT; - int mantissa = (fp8_i32 >> (FLOAT_MANTISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x3; + int32_t in_val = *((int32_t*)&fp8_f32); + uint32_t in_sign = (in_val >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; /* bit 31 is sign */ + uint32_t in_exp = (in_val 
>> FLOAT_MANTISSA_SIZE) & 0xFF; /* bit[30: 24] is exp */ + uint32_t in_man = (in_val & 0x7FFFFF); /* low 23 bits is man */ - exp = (exp + expShiftValue) & 0x1F; + uint32_t out_sign = in_sign; + int32_t out_exp = (in_exp + FLOAT8_E5M2_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT); /* in_exp - fp32bias + SE5M2 bias */ + uint32_t man_rounding = 0, out_man = 0, out_val = 0; - return (uint8_t)(sign << 7 | exp << 2 | mantissa); + man_rounding = (in_man + 0x100000) >> 21; /* manrounding is 2 bits */ + if (((man_rounding >> 2) && 0x1) == 1) { + /* when in_man like 0b11, exp += 1, mantissa is 0*/ + out_exp += 1; } + + /* Clamp Denorm to zero */ + if (out_exp <= 0) { + out_exp = 0; + man_rounding = 0; + out_sign = 0; + } + + out_man = man_rounding & 0x3; /* keep low 9 bits of man */ + /* overflow policy */ + if (out_exp >= 31) { + out_exp = 30; + out_man = 3; +#if 0 + if (mode == VX_CONVERT_POLICY_SATURATE) { + out_exp = 30; + out_man = 3; + } else if (mode == VX_CONVERT_POLICY_INF) { + out_exp = 31; + out_man = 0; + } else { + vxmASSERT(0 && "Error overflow mode!\n"); + } +#endif + } + out_val = (out_sign << 7) | (out_exp << 2) | out_man; + return (uint8_t)(out_val & 0xFF); } /* fp32_to_fp8_e5m2() */ static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) { float val_fp32; - uint32_t signOut = 0; uint32_t exponentOut = 0; uint32_t mantissaOut = 0; uint32_t out_u = 0; - uint32_t signIn; - uint32_t exponentIn; - uint32_t mantissaIn; - int expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E4M3_BIAS_EXPONENT; - - signIn = (in >> (FLOAT8_E4M3_EXPONENT_SIZE + FLOAT8_E4M3_MANTISSA_SIZE)) & 0x1; - exponentIn = (in >> FLOAT8_E4M3_MANTISSA_SIZE) & 0xF; - mantissaIn = in & 0x7; - - signOut = signIn; - - if (exponentIn == 0 && mantissaIn == 0) { - goto final; + uint32_t signIn; + uint32_t exponentIn; + uint32_t mantissaIn; + uint32_t expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E4M3_BIAS_EXPONENT; + //uint32_t i = 0; + //uint32_t intMsk = 0x4; + + signIn = (in >> (FLOAT8_E4M3_EXPONENT_SIZE + FLOAT8_E4M3_MANTISSA_SIZE)) & 0x1; + exponentIn = (in >> FLOAT8_E4M3_MANTISSA_SIZE) & 0xF; + mantissaIn = in & 0x7; + + signOut = signIn; + + /* clamp subnorm*/ + if (exponentIn == 0) { + goto final; + } + /* + if (exponentIn == 0 && mantissaIn == 0) + { + break; + } + else if (exponentIn == 0) + { + while (!(mantissaIn & intMsk)) + { + intMsk >>= 1; + ++i; + } + exponentOut = (exponentIn + expShiftValue - i) & 0xff; + mantissaIn = ((mantissaIn ^ intMsk) << (i + 1)); + mantissaOut = (mantissaIn << (FLOAT_MATISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7fffff; + break; + } + */ + + if (exponentIn == 0xf && mantissaIn == 0x7) { + exponentOut = 0xff; + mantissaOut = 0x400000; + goto final; + } + + exponentOut = (exponentIn + expShiftValue) & 0xff; + mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7fffff; } - - if (exponentIn == 0xf && mantissaIn == 0x7) - { - exponentOut = 0xff; - mantissaOut = 0x400000; - goto final; - } - - exponentOut = (exponentIn + expShiftValue) & 0xff; - mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7fffff; - - final: out_u = signOut << 31 | exponentOut << 23 | mantissaOut; val_fp32 = *((float*)&out_u); @@ -546,44 +617,60 @@ final: static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) { float val_fp32; - uint32_t signOut = 0; uint32_t exponentOut = 0; uint32_t mantissaOut = 0; uint32_t out_u = 0; - uint32_t signIn; - uint32_t exponentIn; - uint32_t mantissaIn; - int expShiftValue = FLOAT_BIAS_EXPONENT - 
FLOAT8_E5M2_BIAS_EXPONENT; - - signIn = (in >> 7) & 0x1; - exponentIn = (in >> 2) & 0x1F; - mantissaIn = in & 0x3; - - signOut = signIn; - - if (exponentIn == 0 && mantissaIn == 0) { - goto final; + uint32_t signIn; + uint32_t exponentIn; + uint32_t mantissaIn; + uint32_t expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E5M2_BIAS_EXPONENT; + //uint32_t i = 0; + //uint32_t intMsk = 0x2; + + signIn = (in >> (FLOAT8_E5M2_EXPONENT_SIZE + FLOAT8_E5M2_MANTISSA_SIZE)) & 0x1; + exponentIn = (in >> FLOAT8_E5M2_MANTISSA_SIZE) & 0x1F; + mantissaIn = in & 0x3; + + signOut = signIn; + + /* clamp subnorm*/ + if (exponentIn == 0) { + goto final; + } + /* + if (exponentIn == 0 && mantissaIn == 0) + { + break; + } + else if (exponentIn == 0) + { + while (!(mantissaIn & intMsk)) + { + intMsk >>= 1; + ++i; + } + exponentOut = (exponentIn + expShiftValue - i) & 0xff; + mantissaIn = ((mantissaIn ^ intMsk) << (i + 1)); + mantissaOut = (mantissaIn << (FLOAT_MATISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x7fffff; + break; + } + */ + + if (exponentIn == 0x1f && mantissaIn == 0x3) { + exponentOut = 0xff; + mantissaOut = 0x400000; + goto final; + } + + exponentOut = (exponentIn + expShiftValue) & 0xff; + mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x7fffff; } - - if (exponentIn == 0x1f && mantissaIn == 0x3) - { - exponentOut = 0xff; - mantissaOut = 0x400000; - goto final; - } - - - exponentOut = (exponentIn + expShiftValue) & 0xff; - mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x7fffff; - - - final: +final: out_u = signOut << 31 | exponentOut << 23 | mantissaOut; val_fp32 = *((float*)&out_u); - return val_fp32 * scale; } /* fp8_e5m2_to_fp32() */ diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h index 8504791..4053988 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph.h +++ b/src/tim/vx/internal/include/vsi_nn_graph.h @@ -241,7 +241,7 @@ OVXLIB_API vsi_status vsi_nn_VerifyGraph */ OVXLIB_API vsi_status vsi_nn_RunGraph ( - const vsi_nn_graph_t * graph + vsi_nn_graph_t * graph ); /** @@ -273,7 +273,7 @@ OVXLIB_API vsi_status vsi_nn_AsyncRunGraph OVXLIB_API vsi_status vsi_nn_AsyncRunWait ( - vsi_nn_graph_t * graph + vsi_nn_graph_t * graph ); /** @@ -556,7 +556,7 @@ OVXLIB_API vsi_bool vsi_nn_SetGraphOutputs * @param[in] graph Graph handle * @param[in] id Node id to be removed. 
*/ -void vsi_nn_RemoveNode +OVXLIB_API void vsi_nn_RemoveNode ( vsi_nn_graph_t * graph, vsi_nn_node_id_t id @@ -788,6 +788,14 @@ OVXLIB_API vsi_status vsi_nn_ExecuteGraphLoop vsi_nn_graph_t* graph, vsi_nn_tensor_t *max_iteration_tensor ); + +OVXLIB_API vsi_status vsi_nn_SetGraphTransformOption + ( + vsi_nn_graph_t* graph, + const char* ctrl_str, + size_t size + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_graph_optimization.h b/src/tim/vx/internal/include/vsi_nn_graph_optimization.h index 1f43353..ad3f689 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph_optimization.h +++ b/src/tim/vx/internal/include/vsi_nn_graph_optimization.h @@ -39,7 +39,7 @@ vx_tensor vsi_nn_CreateRawTensorFromData vsi_nn_tensor_attr_t * attr ); -vsi_status vsi_nn_OptimizeGraph +OVXLIB_API vsi_status vsi_nn_OptimizeGraph ( vsi_nn_graph_t* graph, vsi_bool *dirty diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index 5cadddb..f961835 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -208,6 +208,7 @@ #include "ops/vsi_nn_op_grid_sample.h" #include "ops/vsi_nn_op_lpnorm.h" #include "ops/vsi_nn_op_resize_3d.h" +#include "ops/vsi_nn_op_reducel2.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" #include "ops/vsi_nn_op_inverse_sigmoid.h" @@ -404,6 +405,7 @@ typedef union _vsi_nn_nn_param vsi_nn_grid_sample_param gridsample; vsi_nn_lpnorm_param lpnorm; vsi_nn_resize_3d_param resize_3d; + vsi_nn_reducel2_param reducel2; void* client_param; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_ops.h b/src/tim/vx/internal/include/vsi_nn_ops.h index de26f0d..6bbe637 100644 --- a/src/tim/vx/internal/include/vsi_nn_ops.h +++ b/src/tim/vx/internal/include/vsi_nn_ops.h @@ -268,7 +268,7 @@ vsi_status vsi_nn_OpOptimize * * @return VSI_SUCCESS on success, or error code otherwise. 
*/ -vsi_bool vsi_nn_OpCheck +OVXLIB_API vsi_bool vsi_nn_OpCheck ( vsi_nn_op_t op, vsi_nn_node_t * node, diff --git a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h index 59292cd..9cfae60 100644 --- a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h +++ b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h @@ -264,6 +264,14 @@ OVXLIB_API vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam uint32_t enable_nodes_count ); +OVXLIB_API vsi_status vsi_nn_AddBinaryGraphInputsWithCropParamForCropOnly + ( + vsi_nn_graph_t* graph, + vsi_nn_node_id_t* enable_nodes, + vsi_bool* crop_set_start_only, + uint32_t enable_nodes_count + ); + OVXLIB_API vsi_status vsi_nn_UpdateCropParamsForBinaryGraph ( vsi_nn_graph_t* graph, diff --git a/src/tim/vx/internal/include/vsi_nn_tensor_util.h b/src/tim/vx/internal/include/vsi_nn_tensor_util.h index 14bb0d6..3441489 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor_util.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor_util.h @@ -614,6 +614,13 @@ OVXLIB_API vsi_status vsi_nn_SwapTensorHandle vsi_nn_tensor_t * tensor1 ); +OVXLIB_API vsi_status vsi_nn_SwapTensorHandleWithCache + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor0, + vsi_nn_tensor_t * tensor1 + ); + OVXLIB_API vsi_size_t vsi_nn_vxGetTensorElementNum ( vsi_nn_tensor_attr_t *attr diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 399e72e..97fd959 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 84 +#define VSI_NN_VERSION_PATCH 88 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c index b9e77c2..31c1a29 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c @@ -42,7 +42,7 @@ __BEGIN_DECLS /* * Define kernel meta. 
*/ -#define _INPUT_NUM (1) +#define _INPUT_NUM (2) #define _OUTPUT_NUM (1) #define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) #define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.custom_warp_affine") @@ -54,6 +54,7 @@ __BEGIN_DECLS static vx_param_description_t _custom_warp_affine_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, @@ -66,8 +67,9 @@ static vx_param_description_t _custom_warp_affine_kernel_param_def[] = // Add kererl parameters here }; #define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_kernel_param_def ) -#define SCALAR_INPUT_TYPE (2) -#define SCALAR_MATRIX_OFFSET (3) +#define SCALAR_INPUT_TYPE (3) +#define SCALAR_MATRIX_OFFSET (4) +#define SCALAR_INPUT_RGB_TYPE (10) static void _transform_affine ( @@ -142,44 +144,60 @@ DEF_KERNEL_EXECUTOR(_compute) tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); /* alloc the float32 data buffer */ - buffer[1] = (float *)malloc(out_elements * sizeof(float)); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); - memset(buffer[1], 0, out_elements * sizeof(float)); - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + if (tensors[1]) + { + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); + } + + buffer[2] = (float *)malloc(out_elements * sizeof(float)); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create input buffer fail.", final ); + memset(buffer[2], 0, out_elements * sizeof(float)); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_TYPE], &type); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &rgb_type); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_RGB_TYPE], &rgb_type); CHECK_STATUS_FAIL_GOTO(status, final ); for (i = 0; i < 6; i++) { - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i], - &matrix[i]); - CHECK_STATUS_FAIL_GOTO(status, final ); + if (buffer[1] == NULL) + { + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i], + &matrix[i]); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + matrix[i] = buffer[1][i]; + } } - width = attr[1]->shape->data[0]; - height = attr[1]->shape->data[1]; - for(i = 2; i < (vsi_size_t)attr[1]->shape->size; ++i) + width = attr[2]->shape->data[0]; + height = 
attr[2]->shape->data[1]; + for(i = 2; i < (vsi_size_t)attr[2]->shape->size; ++i) { - outer_size *= attr[1]->shape->data[i]; + outer_size *= attr[2]->shape->data[i]; } // Do something for (b = 0; b < outer_size; b++) { float *src_base = buffer[0] + b * attr[0]->shape->data[0] * attr[0]->shape->data[1]; - float *dst_base = buffer[1] + b * width * height; + float *dst_base = buffer[2] + b * width * height; if ( rgb_type == VSI_NN_WARP_AFFINE_TYPE_RGB ) { @@ -274,8 +292,8 @@ DEF_KERNEL_EXECUTOR(_compute) } } - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], out_elements ); + status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[2], out_elements ); CHECK_STATUS_FAIL_GOTO( status, final ); final: for( i = 0; i < _CPU_IO_NUM; i ++ ) @@ -350,7 +368,7 @@ static vsi_nn_kernel_node_t _setup node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create( graph, F32, &buffer[i] ); } - node_params[9] = vsi_nn_kernel_scalar_create( + node_params[SCALAR_INPUT_RGB_TYPE] = vsi_nn_kernel_scalar_create( graph, I32, &rgb_type ); /* Pass parameters to node. */ @@ -360,7 +378,7 @@ static vsi_nn_kernel_node_t _setup { vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] ); } - vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_RGB_TYPE] ); } } return node; diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c index 3272fd6..81f8351 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c @@ -49,29 +49,52 @@ typedef enum _custom_warp_affine_type_e bilinear = VSI_NN_INTERPOLATION_BILINEAR, }custom_warp_affine_type_e; +#define _CUSTOM_WARP_AFFINE_2D_KERNEL_SOURCE "custom_warp_affine_2d" #define _CUSTOM_WARP_AFFINE_KERNEL_SOURCE "custom_warp_affine" #define _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE "custom_warp_affine_rgb" +#define _CUSTOM_WARP_AFFINE_OPTIONAL_KERNEL_SOURCE "custom_warp_affine_optional" +#define _CUSTOM_WARP_AFFINE_RGB_OPTIONAL_KERNEL_SOURCE "custom_warp_affine_rgb_optional" // Add kernel hashtable here -#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D, RGB_TYPE ) \ - (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20) | (RGB_TYPE << 24)) +#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D, RGB_TYPE, OPTIONAL_INTPUT ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20) | \ + (RGB_TYPE << 24) | (OPTIONAL_INTPUT << 28)) #define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ - { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 0 ), \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 0, 0 ), \ CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE), \ _CUSTOM_WARP_AFFINE_KERNEL_SOURCE } #define PACK_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ - { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 0 ), \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 0, 0 ), \ CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D"), \ - _CUSTOM_WARP_AFFINE_KERNEL_SOURCE } + _CUSTOM_WARP_AFFINE_2D_KERNEL_SOURCE } + +#define PACK_OPTIONAL_INPUT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 0, 1 ), \ + 
CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_optional_input"), \ + _CUSTOM_WARP_AFFINE_OPTIONAL_KERNEL_SOURCE } +#define PACK_OPTIONAL_INPUT_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 0, 1 ), \ + CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D_optional_input"), \ + _CUSTOM_WARP_AFFINE_OPTIONAL_KERNEL_SOURCE } + #define PACK_RGB_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ - { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 1 ), \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 1, 0 ), \ CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb"), \ _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE } #define PACK_RGB_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ - { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 1 ), \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 1, 0 ), \ CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb_2D"), \ _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE } +#define PACK_OPTIONAL_INPUT_RGB_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 1, 1 ), \ + CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb_optional_input"), \ + _CUSTOM_WARP_AFFINE_RGB_OPTIONAL_KERNEL_SOURCE } +#define PACK_RGB_2D_OPTIONAL_INPUT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 1, 1 ), \ + CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb_2D_optional_input"), \ + _CUSTOM_WARP_AFFINE_RGB_OPTIONAL_KERNEL_SOURCE } + typedef struct { uint32_t key; @@ -84,15 +107,23 @@ static const _kernel_map_type _custom_warp_affine_kernel_map[] = // Register kernel here PACK_KERNEL_MAP( U8, U8, nearest_neighbor ), PACK_KERNEL_MAP( U8, U8, bilinear ), + PACK_OPTIONAL_INPUT_KERNEL_MAP( U8, U8, nearest_neighbor ), + PACK_OPTIONAL_INPUT_KERNEL_MAP( U8, U8, bilinear ), PACK_2D_KERNEL_MAP( U8, U8, nearest_neighbor ), PACK_2D_KERNEL_MAP( U8, U8, bilinear ), + PACK_OPTIONAL_INPUT_2D_KERNEL_MAP( U8, U8, nearest_neighbor ), + PACK_OPTIONAL_INPUT_2D_KERNEL_MAP( U8, U8, bilinear ), PACK_RGB_KERNEL_MAP( U8, U8, nearest_neighbor ), PACK_RGB_KERNEL_MAP( U8, U8, bilinear ), + PACK_OPTIONAL_INPUT_RGB_KERNEL_MAP( U8, U8, nearest_neighbor ), + PACK_OPTIONAL_INPUT_RGB_KERNEL_MAP( U8, U8, bilinear ), PACK_RGB_2D_KERNEL_MAP( U8, U8, nearest_neighbor ), PACK_RGB_2D_KERNEL_MAP( U8, U8, bilinear ), + PACK_RGB_2D_OPTIONAL_INPUT_KERNEL_MAP( U8, U8, nearest_neighbor ), + PACK_RGB_2D_OPTIONAL_INPUT_KERNEL_MAP( U8, U8, bilinear ), }; /* @@ -110,8 +141,21 @@ static vx_param_description_t _custom_warp_affine_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -#define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_kernel_param_def ) -#define SCALAR_MATRIX_OFFSET (2) + +static vx_param_description_t _custom_warp_affine_optinal_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, 
VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_optinal_kernel_param_def ) /* * Kernel initializer */ @@ -138,17 +182,21 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer) float matrix1[4] = {0}; float matrix4[4] = {0}; int32_t i = 0; - - VSI_UNREFERENCED(param_size); + uint32_t scalar_matrix_offset = 3; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[param_size - 7] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + if (param_size == 8) + { + scalar_matrix_offset = 2; + } + for (i = 0; i < 6; i++) { - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i], + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[scalar_matrix_offset + i], &m[i]); CHECK_STATUS_FAIL_GOTO(status, final ); } @@ -170,13 +218,16 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer) / gpu_param.global_scale[1]); gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; - status = vsi_nn_kernel_gpu_add_param( node, - "matrix0", &matrix0 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "matrix1", &matrix1 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "matrix4", &matrix4 ); - CHECK_STATUS_FAIL_GOTO(status, final ); + if (param_size == 8) + { + status = vsi_nn_kernel_gpu_add_param( node, + "matrix0", &matrix0 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "matrix1", &matrix1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "matrix4", &matrix4 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } status = vsi_nn_kernel_gpu_config( node, &gpu_param ); @@ -217,17 +268,21 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_rgb_initializer) float matrix0[4] = {0}; float matrix1[4] = {0}; int32_t i = 0; - - VSI_UNREFERENCED(param_size); + uint32_t scalar_matrix_offset = 3; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[param_size - 7] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + if (param_size == 8) + { + scalar_matrix_offset = 2; + } + for (i = 0; i < 6; i++) { - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i], + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[scalar_matrix_offset + i], &m[i]); CHECK_STATUS_FAIL_GOTO(status, final ); } @@ -248,11 +303,14 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_rgb_initializer) / gpu_param.global_scale[1]); gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; - status = vsi_nn_kernel_gpu_add_param( node, - "matrix0", &matrix0 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "matrix1", &matrix1 ); - CHECK_STATUS_FAIL_GOTO(status, final ); + if (param_size == 8) + { + status = vsi_nn_kernel_gpu_add_param( node, + "matrix0", &matrix0 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "matrix1", &matrix1 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } status = vsi_nn_kernel_gpu_config( node, &gpu_param ); @@ -280,7 +338,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, int32_t type, - int32_t rgb_type + int32_t rgb_type, + int32_t optional_input ) { vsi_status status = VSI_FAILURE; @@ -289,6 +348,7 @@ static vsi_status _query_kernel const _kernel_map_type * kernel_map = _custom_warp_affine_kernel_map; size_t kernel_map_size = _cnt_of_array( _custom_warp_affine_kernel_map ); vx_param_description_t * param_def = _custom_warp_affine_kernel_param_def; + size_t param_def_size = _cnt_of_array( _custom_warp_affine_kernel_param_def ); vx_kernel_initialize_f initializer = _custom_warp_affine_initializer; int32_t is_2d_img = inputs[0]->attr.dim_num < 3 || inputs[0]->attr.size[2] == 1; uint32_t key = 0; @@ -297,7 +357,12 @@ static vsi_status _query_kernel in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img, rgb_type ); + if (optional_input == 1) + { + param_def = _custom_warp_affine_optinal_kernel_param_def; + param_def_size = _cnt_of_array(_custom_warp_affine_optinal_kernel_param_def); + } + key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img, rgb_type, optional_input ); if (rgb_type == 1) { initializer = _custom_warp_affine_rgb_initializer; @@ -313,7 +378,7 @@ static vsi_status _query_kernel { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); kernel->info.parameters = param_def; - kernel->info.numParams = _cnt_of_array( _custom_warp_affine_kernel_param_def ); + kernel->info.numParams = (vx_uint32)param_def_size; kernel->info.initialize = initializer; // Register code source vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, @@ -348,13 +413,23 @@ static vsi_nn_kernel_node_t _setup int32_t type = vsi_nn_kernel_param_get_int32( params, "type"); int32_t rgb_type = vsi_nn_kernel_param_get_int32( params, "rgb_type"); float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size ); + int32_t optional_input = 1; + uint32_t scalar_matrix_offset = 3; + uint32_t param_num = _CUSTOM_WARP_AFFINE_PARAM_NUM; + if (inputs[1] == NULL) + { + optional_input = 0; + input_num = 1; + scalar_matrix_offset = scalar_matrix_offset - 1; + param_num = param_num - 1; + } if (vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) { return NULL; } - status = _query_kernel( kernel, inputs, outputs, type, rgb_type ); + status = _query_kernel( kernel, inputs, outputs, type, rgb_type, optional_input ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -364,19 +439,20 @@ static vsi_nn_kernel_node_t _setup border.mode = VX_BORDER_CONSTANT; /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM, + vsi_nn_kernel_node_pack_io( node_params, param_num, inputs, input_num, outputs, output_num ); for (i = 0; i < buffer_size; i++) { - 
node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create( + node_params[scalar_matrix_offset + i] = vsi_nn_kernel_scalar_create( graph, F32, &buffer[i] ); } /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( node, node_params, param_num ); for (i = 0; i < buffer_size; i++) { - vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] ); + vsi_nn_kernel_scalar_release( &node_params[scalar_matrix_offset + i] ); } + // Set default border mode. border.constant_value.U32 = 0x00000000; status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c index 5ee37c5..94a2d70 100644 --- a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c +++ b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c @@ -42,7 +42,7 @@ typedef struct _custom_warp_affine_local_data_t { /* Declare number of input and output. */ -#define _INPUT_NUM (1) +#define _INPUT_NUM (2) #define _OUTPUT_NUM (1) static vsi_status op_compute @@ -63,7 +63,7 @@ static vsi_status op_compute self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "custom_warp_affine", - inputs, 1, + inputs, 2, outputs, 1, param ); vsi_nn_kernel_param_release( ¶m ); diff --git a/src/tim/vx/internal/src/kernel/cl/clip_cl.c b/src/tim/vx/internal/src/kernel/cl/clip_cl.c index ec74f36..8aa2e2a 100644 --- a/src/tim/vx/internal/src/kernel/cl/clip_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/clip_cl.c @@ -269,7 +269,7 @@ static vsi_nn_kernel_node_t _setup ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank); - if ( ret ) + if ( !ret ) { return NULL; } diff --git a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c index 8dca931..3a5e0d7 100644 --- a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c @@ -297,21 +297,13 @@ static vsi_nn_kernel_node_t _setup if (axis < 0) { - axis_new = 0; - shapes[0][0] = 1; - shapes[0][1] = 1; - for (i = 0; i < inputs[0]->attr.dim_num; i++) - { - shapes[0][0] *= inputs[0]->attr.size[i]; - } - rs_dim = 2; - } - else - { - vsi_nn_kernel_optimize_softmax_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, - shapes[0], &rs_dim, &axis_new); + axis += (int32_t)inputs[0]->attr.dim_num; } + + vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rs_dim, &axis_new); + if (rs_dim > 3) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c index a3fa2d6..e6a6743 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c @@ -327,19 +327,40 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" ); - int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); - int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); - int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); - int32_t indices_num = 
vsi_nn_kernel_param_get_int32( params, "indices_num" ); + int32_t block_size = 1; + int32_t block_num = 1; + int32_t axis_num = 0; + int32_t indices_num = 1; int32_t is_batch = batch_dims > 0 ? 1 : 0; vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3; - int32_t is_array = block_size >= GPU_TENSOR_MAX_WIDTH ? 1 : 0; - int32_t i = 0; + int32_t is_array = 0; + uint32_t i = 0; + vsi_size_t *input_size = inputs[0]->attr.size; + uint32_t r_rank = vsi_nn_GetTensorIsScalar(inputs[0]) ? 0 : inputs[0]->attr.dim_num; + uint32_t q_rank = vsi_nn_GetTensorIsScalar(inputs[1]) ? 0 : inputs[1]->attr.dim_num; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); + for (i = 0; i < (uint32_t)axis; ++i) + { + block_size *= (int32_t)input_size[i]; + } + + axis_num = (int32_t)input_size[axis]; + for (i = axis + 1; i < r_rank - batch_dims; ++i) + { + block_num *= (int32_t)input_size[i]; + } + for (i = 0; i < q_rank - batch_dims; ++i) + { + indices_num *= (int32_t)inputs[1]->attr.size[i]; + } + + is_array = block_size >= GPU_TENSOR_MAX_WIDTH ? 1 : 0; + status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0, &is_array); status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array); status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0, &is_array); diff --git a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c index 3fc716c..f7089bf 100644 --- a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c @@ -60,8 +60,13 @@ __BEGIN_DECLS HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, F32, F32), \ VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, +#define TENSOR_LOG_SOFTMAX_BFLOAT(AXIS, SRC0_TYPE, OUT_TYPE) \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \ + HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ + VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, + #define HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE"_2D") + CVIVANTE_NAMESPACE("cl.log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE"_2D") #define TENSOR_LOG_SOFTMAX_KERNELS_2D(AXIS, SRC0_TYPE, OUT_TYPE) \ { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \ @@ -73,6 +78,11 @@ __BEGIN_DECLS HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, F32, F32), \ VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, +#define TENSOR_LOG_SOFTMAX_BFLOAT_2D(AXIS, SRC0_TYPE, OUT_TYPE) \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \ + HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ + VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, + static const struct { uint32_t key; char* function_name; @@ -85,11 +95,16 @@ static const struct { TENSOR_LOG_SOFTMAX_FLOAT(0, F16, F16) TENSOR_LOG_SOFTMAX_FLOAT(1, F16, F16) TENSOR_LOG_SOFTMAX_FLOAT(2, F16, F16) + TENSOR_LOG_SOFTMAX_BFLOAT(0, BF16, BF16) + TENSOR_LOG_SOFTMAX_BFLOAT(1, BF16, BF16) + TENSOR_LOG_SOFTMAX_BFLOAT(2, BF16, BF16) TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F32, F32) TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F32, F32) TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F16, F16) TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F16, F16) + TENSOR_LOG_SOFTMAX_BFLOAT_2D(0, BF16, BF16) + TENSOR_LOG_SOFTMAX_BFLOAT_2D(1, BF16, BF16) TENSOR_LOG_SOFTMAX_KERNELS(0, U8, U8) TENSOR_LOG_SOFTMAX_KERNELS(1, U8, U8) diff --git a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c index dec27e3..aeb8b4d 100644 
--- a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c @@ -35,7 +35,6 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS @@ -1572,8 +1571,8 @@ static vsi_nn_kernel_node_t _setup if (outputs[LSTMUNIT_ACT_OUTPUT] && VSI_NN_TYPE_UINT8 == outputs[LSTMUNIT_ACT_OUTPUT]->attr.dtype.vx_type) { - scale_val[8] = 1.0f / vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_OUTPUT]); - tail_val[8] = (float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_OUTPUT]); + scale_val[8] = 1.0f / vsi_nn_get_tensor_scale(outputs[LSTMUNIT_ACT_OUTPUT]); + tail_val[8] = (float)vsi_nn_get_tensor_zero_point(outputs[LSTMUNIT_ACT_OUTPUT]); } if( VSI_SUCCESS == status) diff --git a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c index de336c9..ac342d3 100644 --- a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c @@ -35,6 +35,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" __BEGIN_DECLS @@ -44,6 +45,7 @@ __BEGIN_DECLS #define KERNEL_SOURCE_1 "matrixmul" #define KERNEL_SOURCE_2 "matrixmul_transA" #define KERNEL_SOURCE_3 "matrixmul_cross" +#define KERNEL_SOURCE_4 "matrixmul_4x" typedef enum { @@ -51,8 +53,9 @@ __BEGIN_DECLS _3D } vsi_nn_kernel_image_dim_type_e; -#define HASH_MATRIXMUL_KEY(_type0, _type1, _type2, _image_dim, _trans_a, _cross) \ - ((_type0 << 24) | (_type1 << 16) | (_type2 << 8) | (_image_dim << 4) | (_trans_a << 2) | (_cross)) +#define HASH_MATRIXMUL_KEY(_type0, _type1, _type2, _image_dim, flag_4x, _trans_a, _cross) \ + ((_type0 << 24) | (_type1 << 16) | (_type2 << 8) | (_image_dim << 6) | \ + (flag_4x << 4) | (_trans_a << 2) | (_cross)) #define HASH_MATRIXMUL_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ CVIVANTE_NAMESPACE("cl.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) @@ -66,23 +69,39 @@ __BEGIN_DECLS #define HASH_MATRIXMUL_SH_KERNEL_MERGE_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("cl.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_merge") +#define HASH_MATRIXMUL_4X_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ + CVIVANTE_NAMESPACE("cl.gemm_4x_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) + +#define HASH_MATRIXMUL_4X_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ + CVIVANTE_NAMESPACE("cl.gemm_4x_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) + #define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ - { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0), \ + { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0, 0), \ HASH_MATRIXMUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ SOURCE }, +#define TENSOR_MATRIXMUL_4X_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ + {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1, 0, 0), \ + HASH_MATRIXMUL_4X_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ + SOURCE }, + +#define TENSOR_MATRIXMUL_4X_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ + {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1, 1, 0), \ + HASH_MATRIXMUL_4X_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ + SOURCE }, + #define TENSOR_MATRIXMUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 
SOURCE) \ - { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1, 0), \ + {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 1, 0), \ HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ SOURCE }, #define TENSOR_MATRIXMUL_TRANSB_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ - { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2, 0), \ + {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 2, 0), \ HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ SOURCE }, #define TENSOR_MATRIXMUL_MERGE_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ - { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 2), \ + {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0, 2), \ HASH_MATRIXMUL_SH_KERNEL_MERGE_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ SOURCE }, @@ -92,35 +111,37 @@ static const struct { const char* source_name; } matrixmul_map[] = { - TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_2) - TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_2) - TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _2D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _3D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_TRANSA_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_2) - TENSOR_MATRIXMUL_TRANSA_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_2) - TENSOR_MATRIXMUL_TRANSB_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_TRANSB_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_2) - TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_2) - TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_2) - TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_2) - TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_MERGE_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_3) - TENSOR_MATRIXMUL_MERGE_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_3) - TENSOR_MATRIXMUL_MERGE_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_3) + TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_1) + 
TENSOR_MATRIXMUL_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSA_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSA_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSB_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_MERGE_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_3) + TENSOR_MATRIXMUL_MERGE_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_3) + TENSOR_MATRIXMUL_MERGE_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_3) + TENSOR_MATRIXMUL_4X_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4) + TENSOR_MATRIXMUL_4X_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4) }; /* @@ -252,12 +273,53 @@ final: return status; } /* _matrixmul_initializer() */ +DEF_KERNEL_INITIALIZER(_matrixmul_4x_initializer) +(vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t* param, + size_t param_size) { + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}; + + vsi_nn_kernel_tensor_attr_t* attr = NULL; + vsi_size_t width = 0; + vsi_size_t height = 0; + + VSI_UNREFERENCED(param_size); + + attr = + vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]); + CHECK_PTR_FAIL_GOTO(attr, "Create tensor attr buffer fail.", final); + + width = attr->shape->data[0]; + height = attr->shape->data[1]; + + gpu_param.dim = 2; + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0]; + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config(node, &gpu_param); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr) { + vsi_nn_kernel_tensor_attr_release(&attr); + attr = NULL; + } + return status; +} /* _matrixmul_4x_initializer() */ + static vsi_status _query_kernel ( vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, vsi_size_t depth, + int32_t flag_4x, int32_t transa, int32_t cross ) @@ -317,7 +379,7 @@ static vsi_status _query_kernel output_dtype = U8; } - key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, transa, cross ); + key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, flag_4x, transa, cross ); for( i = 0; i < _cnt_of_array(matrixmul_map); i ++ ) { @@ -340,7 +402,13 @@ static vsi_status _query_kernel kernel->info.parameters = _matrixmul_merge_kernel_param_def; kernel->info.numParams = _cnt_of_array( _matrixmul_merge_kernel_param_def ); } - kernel->info.initialize = _matrixmul_initializer; + + if (flag_4x) { + kernel->info.initialize = _matrixmul_4x_initializer; + } 
else { + kernel->info.initialize = _matrixmul_initializer; + } + // Register code source vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "eltwise_ops_helper", @@ -352,6 +420,8 @@ static vsi_status _query_kernel } return status; } /* _query_kernel() */ + + static vsi_nn_kernel_node_t _setup ( vsi_nn_graph_t * graph, @@ -368,8 +438,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" ); int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" ); - int32_t cross_flg = vsi_nn_kernel_param_get_int32( params, "cross_flg" ); int32_t transFlg = 0; + int32_t flag_4x = 0; vsi_size_t M = inputs[0]->attr.size[1]; vsi_size_t K = inputs[0]->attr.size[0]; vsi_size_t N = inputs[1]->attr.size[0]; @@ -385,6 +455,22 @@ static vsi_nn_kernel_node_t _setup float scale_out = vsi_nn_get_tensor_scale(outputs[0]); float zp_out = (float)vsi_nn_get_tensor_zero_point(outputs[0]); int32_t outer = 0; + vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + uint32_t final_rank = 0; + vsi_nn_tensor_t* rs_in_tensors = NULL; + vsi_nn_tensor_t* rs_out_tensors = NULL; + vsi_nn_tensor_t* final_in_tensors[2] = {NULL}; + vsi_nn_tensor_t* final_out_tensors[1] = {NULL}; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e input1_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + uint32_t new_rank[3] = {0}; + uint32_t cross_flg = 0; + uint32_t size_axis_in_out[3] = {0}; + uint32_t stride_axis_in_out[9] = {0}; + vsi_nn_tensor_t* tmp_inputs[2] = {NULL}; + vsi_nn_tensor_t* tmp_outputs[1] = {NULL}; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); @@ -397,6 +483,33 @@ static vsi_nn_kernel_node_t _setup return NULL; } + status = vsi_nn_kernel_optimize_matrixmul_broadcast_shape( + inputs[0]->attr.size, + inputs[1]->attr.size, + outputs[0]->attr.size, + inputs[0]->attr.dim_num, + inputs[1]->attr.dim_num, + outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], new_rank, + &cross_flg, size_axis_in_out, stride_axis_in_out); + if (status) + { + tmp_inputs[0] = vsi_nn_reshape_tensor(graph, inputs[0], shapes[0], new_rank[0]); + tmp_inputs[1] = vsi_nn_reshape_tensor(graph, inputs[1], shapes[1], new_rank[1]); + tmp_outputs[0] = vsi_nn_reshape_tensor(graph, outputs[0], shapes[2], new_rank[2]); + + M = tmp_inputs[0]->attr.size[1]; + K = tmp_inputs[0]->attr.size[0]; + N = tmp_inputs[1]->attr.size[0]; + depth = tmp_outputs[0]->attr.dim_num > 2 ? tmp_outputs[0]->attr.size[2] : 1; + } + else + { + VSILOGE("illegal inputs shape"); + status = VSI_FAILURE; + goto final; + } + if (transposeB) { N = inputs[1]->attr.size[1]; @@ -410,8 +523,8 @@ static vsi_nn_kernel_node_t _setup transFlg = 1; } - a_depth = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; - b_depth = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1; + a_depth = tmp_inputs[0]->attr.dim_num > 2 ? tmp_inputs[0]->attr.size[2] : 1; + b_depth = tmp_inputs[1]->attr.dim_num > 2 ? 
tmp_inputs[1]->attr.size[2] : 1; if (b_depth == 1) { @@ -422,14 +535,14 @@ static vsi_nn_kernel_node_t _setup ac2zero = 1; } - if (inputs[0]->attr.dim_num == 4 && inputs[1]->attr.dim_num == 3 + if (tmp_inputs[0]->attr.dim_num == 4 && tmp_inputs[1]->attr.dim_num == 3 && a_depth > 1 && b_depth > 1 && cross_flg == 2) { ac2zero = 1; bc2zero = 0; outer = (int32_t)a_depth; } - else if (inputs[1]->attr.dim_num == 4 && inputs[0]->attr.dim_num == 3 + else if (tmp_inputs[1]->attr.dim_num == 4 && tmp_inputs[0]->attr.dim_num == 3 && a_depth > 1 && b_depth > 1 && cross_flg == 2) { ac2zero = 0; @@ -437,7 +550,46 @@ static vsi_nn_kernel_node_t _setup outer = (int32_t)b_depth; } - status = _query_kernel( kernel, inputs, outputs, depth, transFlg, cross_flg ); + final_in_tensors[0] = tmp_inputs[0]; + final_in_tensors[1] = tmp_inputs[1]; + final_out_tensors[0] = tmp_outputs[0]; + + input0_dtype = vsi_nn_kernel_map_dtype(tmp_inputs[0]->attr.dtype.vx_type); + input1_dtype = vsi_nn_kernel_map_dtype(tmp_inputs[1]->attr.dtype.vx_type); + output_dtype = vsi_nn_kernel_map_dtype(tmp_outputs[0]->attr.dtype.vx_type); + + + if (((transFlg == 0) || (transFlg == 1)) && (cross_flg == 0) && + (F32 == input0_dtype) && (F32 == input1_dtype) && (F32 == output_dtype)) + { + vsi_size_t in1_w = tmp_inputs[1]->attr.size[0]; + vsi_size_t in1_h = tmp_inputs[1]->attr.size[1]; + vsi_size_t in1_c = tmp_inputs[1]->attr.dim_num > 2 ? tmp_inputs[1]->attr.size[2] : 1; + vsi_size_t in1_n = tmp_inputs[1]->attr.dim_num > 3 ? tmp_inputs[1]->attr.size[3] : 1; + vsi_size_t out_w = tmp_outputs[0]->attr.size[0]; + vsi_size_t out_h = tmp_outputs[0]->attr.size[1]; + vsi_size_t out_c = tmp_outputs[0]->attr.dim_num > 2 ? tmp_outputs[0]->attr.size[2] : 1; + vsi_size_t out_n = tmp_outputs[0]->attr.dim_num > 3 ? tmp_outputs[0]->attr.size[3] : 1; + if ((in1_w == 1) && (in1_h % 4 == 0) && (in1_c == 1) && (in1_n == 1) && + (out_w == 1) && (out_h % 4 == 0) && (out_c == 1) && (out_n == 1)) + { + final_shape[0] = in1_h; + final_shape[1] = in1_w; + final_rank = 2; + rs_in_tensors = vsi_nn_reshape_tensor(graph, tmp_inputs[1], final_shape, final_rank); + final_in_tensors[1] = rs_in_tensors; + + final_shape[0] = out_h; + final_shape[1] = out_w; + final_rank = 2; + rs_out_tensors = vsi_nn_reshape_tensor(graph, tmp_outputs[0], final_shape, final_rank); + final_out_tensors[0] = rs_out_tensors; + + flag_4x = 1; + } + } + + status = _query_kernel(kernel, tmp_inputs, tmp_outputs, depth, flag_4x, transFlg, cross_flg); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -447,7 +599,7 @@ static vsi_nn_kernel_node_t _setup size_t param_num = cross_flg == 2 ? _MATRIXMUL_MERGE_PARAM_NUM : _MATRIXMUL_PARAM_NUM; /* Pass parameters to node. 
*/ vsi_nn_kernel_node_pack_io( node_params, param_num, - inputs, 2, outputs, 1 ); + final_in_tensors, 2, final_out_tensors, 1 ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &M ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &K ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &N ); @@ -483,6 +635,14 @@ static vsi_nn_kernel_node_t _setup } } } + +final: + vsi_safe_release_tensor(tmp_inputs[0]); + vsi_safe_release_tensor(tmp_inputs[1]); + vsi_safe_release_tensor(tmp_outputs[0]); + vsi_safe_release_tensor(rs_in_tensors); + vsi_safe_release_tensor(rs_out_tensors); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/moments_cl.c b/src/tim/vx/internal/src/kernel/cl/moments_cl.c index 4afda36..9e6a8f1 100644 --- a/src/tim/vx/internal/src/kernel/cl/moments_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/moments_cl.c @@ -35,7 +35,8 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "kernel/vsi_nn_kernel_eltwise.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_dtype_util.h" __BEGIN_DECLS @@ -114,6 +115,7 @@ static const _kernel_map_type moments_map[] = TENSOR_MOMENTS_TWO_AXIS_KERNELS(F32, F32, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS(I32, F32, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS(BF16,F32, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, F32, 1, 2, KERNEL_SOURCE_4) TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F32, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(F32, F32, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(I32, F32, 0, 1, 2, KERNEL_SOURCE_5) @@ -140,63 +142,6 @@ static vx_param_description_t _moments_kernel_param_def[] = }; #define _MOMENTS_PARAM_NUM _cnt_of_array( _moments_kernel_param_def ) -static int32_t set_constant_border - ( - vsi_nn_kernel_node_t node, - int32_t value - ) -{ - vsi_status status = VSI_FAILURE; - vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.S32 = value; - border.constant_value.U32 = (vx_uint32)value; - border.constant_value.S16 = (vx_int16)value; - border.constant_value.U8 = (vx_uint8)value; - status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); - return status; -} - -static int32_t get_moments_output_reshape_size - ( - vsi_nn_tensor_t ** outputs, - vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], - int32_t* axis, - int32_t axis_num - ) -{ - uint32_t out_dims_num = outputs[0]->attr.dim_num; - vsi_size_t *output_size = outputs[0]->attr.size; - uint32_t i = 0; - int32_t out_rs_flg = 0; - - for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) - { - sizes[i] = 1; - } - sizes[3] = out_dims_num > 3 ? output_size[3] : 1; - - if (axis_num == 1 && axis[0] == 0) - { - sizes[0] = output_size[1]; - sizes[1] = out_dims_num > 2 ? output_size[2] : 1; - out_rs_flg = 1; - } - else if (axis_num == 1 && axis[0] == 1) - { - sizes[0] = output_size[0]; - sizes[1] = out_dims_num > 2 ? output_size[2] : 1; - out_rs_flg = 1; - } - else if (axis_num == 2 && axis[0] == 0 && axis[1] == 1) - { - sizes[0] = out_dims_num > 2 ? 
output_size[2] : 1; - out_rs_flg = 1; - } - - return out_rs_flg; -} /* _get_moments_tensor_reshape_size */ - /* * Kernel initializer */ @@ -247,26 +192,39 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) gpu_param.global_size[0] = gpu_align_p2((height + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = chn; + gpu_param.global_size[2] = 1; } else if (axis_num == 1 && axis == 1) { gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = chn; + gpu_param.global_size[2] = 1; } else if (axis_num == 1 && axis == 2) { gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = height; + gpu_param.global_size[2] = 1; } - else if (axis_num == 2) + else if (axis_num == 2 && axis == 0) { gpu_param.local_size[0] = 16; gpu_param.local_size[1] = 1; gpu_param.local_size[2] = 1; gpu_param.global_size[0] = 16; gpu_param.global_size[1] = chn; + gpu_param.global_size[2] = 1; + } + else if (axis_num == 2 && axis == 1) + { + gpu_param.local_size[0] = 8; + gpu_param.local_size[1] = 8; + gpu_param.local_size[2] = 1; + gpu_param.global_size[0] = 8; + gpu_param.global_size[1] = 8; + gpu_param.global_size[2] = width; } else if (axis_num == 3) { @@ -275,8 +233,8 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) gpu_param.local_size[2] = 1; gpu_param.global_size[0] = 16; gpu_param.global_size[1] = 1; + gpu_param.global_size[2] = 1; } - gpu_param.global_size[2] = 1; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); CHECK_STATUS_FAIL_GOTO(status, final); @@ -366,117 +324,78 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t node_params[_MOMENTS_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - vsi_size_t out_shape[VSI_NN_MAX_DIM_NUM] = {0}; - vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t out_rs_flg = 0; - int32_t axis_num = 0; - size_t axis_num_temp = 0; - int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", &axis_num_temp); - int32_t keep_dim = vsi_nn_kernel_param_get_int32( params, "keep_dim" ); + size_t axis_num = 0; + int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", &axis_num); int32_t first_axis = axis[0]; - int32_t i = 0; + uint32_t i = 0; vsi_nn_kernel_scalar_t scalar_list[INTERNAL_MOMENTS_SCALAR_NUM] = {NULL}; - vsi_nn_kernel_tensor_t reshape_tensors[3] = { NULL }; - - vsi_size_t width = inputs[0]->attr.size[0]; - vsi_size_t height = inputs[0]->attr.size[1]; - vsi_size_t chn = inputs[0]->attr.size[2]; + uint32_t axis_size = 0; + uint32_t rank_in = 0; + uint32_t rank_out = 0; + vsi_bool ret = FALSE; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 1, 1, 1, 1 } }; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]); float input_scale = vsi_nn_get_tensor_scale(inputs[0]); - float dim_ratio = (float)1.0 / (float)(width * height); + float dim_ratio = 1; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); - axis_num = (int32_t)axis_num_temp; + ret = vsi_nn_kernel_optimize_reduce_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + axis, (vsi_size_t)axis_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], &rank_in, shapes[1], &rank_out, + new_axis, &axis_size); - if (axis_num == 1 && axis[0] == 0) - { - dim_ratio = (float)1.0 / (float)(width); - } - else if (axis_num == 1 && 
axis[0] == 1) - { - dim_ratio = (float)1.0 / (float)(height); - } - else if (axis_num == 1 && axis[0] == 2) - { - dim_ratio = (float)1.0 / (float)(chn); - } - else if (axis_num == 2 && axis[0] == 0 && axis[1] == 1) - { - dim_ratio = (float)1.0 / (float)(width * height); - } - else if (axis_num == 3) - { - dim_ratio = (float)1.0 / (float)(width * height * chn); - } - - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + if ( ret == FALSE || axis_size > 3 || (axis_size == 3 && new_axis[0] != 0)) { return NULL; } - if (keep_dim) + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[1], rank_out ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[1], shapes[1], rank_out ); + + first_axis = new_axis[0]; + + for ( i = 0; i < axis_size; i++ ) { - out_rs_flg = get_moments_output_reshape_size(&outputs[0], out_shape, axis, axis_num); + dim_ratio = dim_ratio / (float)(shapes[0][new_axis[i]]); } - if (inputs[0]->attr.dim_num < 2) + if ( !vsi_nn_kernel_gpu_check_shape( shapes[0], rank_in) ) { - shape[0] = inputs[0]->attr.size[0]; - shape[1] = 1; - reshape_tensors[0] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 2 ); - } - if (outputs[0]->attr.dim_num < 2) - { - shape[0] = outputs[0]->attr.size[0]; - shape[1] = 1; - reshape_tensors[1] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 2 ); - reshape_tensors[2] = vsi_nn_kernel_tensor_reshape( outputs[1]->t, shape, 2 ); + return NULL; } scalar_list[AXIS] = vsi_nn_kernel_scalar_create( graph, I32, &first_axis ); - scalar_list[AXIS_NUM] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); + scalar_list[AXIS_NUM] = vsi_nn_kernel_scalar_create( graph, I32, &axis_size ); scalar_list[ZP] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp ); scalar_list[SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); - scalar_list[WIDTH] = vsi_nn_kernel_scalar_create( graph, I32, &width ); - scalar_list[HEIGHT] = vsi_nn_kernel_scalar_create( graph, I32, &height ); - scalar_list[CHN] = vsi_nn_kernel_scalar_create( graph, I32, &chn ); + scalar_list[WIDTH] = vsi_nn_kernel_scalar_create( graph, I32, &shapes[0][0] ); + scalar_list[HEIGHT] = vsi_nn_kernel_scalar_create( graph, I32, &shapes[0][1] ); + scalar_list[CHN] = vsi_nn_kernel_scalar_create( graph, I32, &shapes[0][2] ); scalar_list[DIMRATIO] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio ); - status = _query_kernel( inputs, outputs, kernel, params, axis, axis_num, 0 ); + status = _query_kernel( inputs, outputs, kernel, params, new_axis, axis_size, 0 ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { uint32_t index = 0; - int32_t constant_value = vsi_nn_get_tensor_zero_point(inputs[0]); + vx_border_t border; /* Pass parameters to node. 
*/ - if (reshape_tensors[0]) - { - node_params[index++] = reshape_tensors[0]; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)(inputs[0]->t); - } - if (out_rs_flg) - { - node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, out_shape, 4 ); - node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[1]->t, out_shape, 4 ); - } - else if (reshape_tensors[1]) - { - node_params[index++] = reshape_tensors[1]; - node_params[index++] = reshape_tensors[2]; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)(outputs[0]->t); - node_params[index++] = (vsi_nn_kernel_node_param_t)(outputs[1]->t); - } + node_params[index++] = reshape_tensors[0]->t; + node_params[index++] = reshape_tensors[1]->t; + node_params[index++] = reshape_tensors[2]->t; + node_params[index++] = scalar_list[AXIS]; node_params[index++] = scalar_list[AXIS_NUM]; node_params[index++] = scalar_list[ZP]; @@ -487,29 +406,19 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = scalar_list[DIMRATIO]; status = vsi_nn_kernel_node_pass_param( node, node_params, _MOMENTS_PARAM_NUM ); CHECK_STATUS(status); - if (out_rs_flg) - { - vsi_nn_kernel_tensor_release( &node_params[1] ); - vsi_nn_kernel_tensor_release( &node_params[2] ); - } - status = set_constant_border(node, constant_value); + // Set default border mode. + border.mode = VX_BORDER_CONSTANT; + vsi_nn_Float32ToDtype(0, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype); + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); } } - if (reshape_tensors[0]) - { - vsi_nn_kernel_tensor_release( &reshape_tensors[0] ); - } - if (reshape_tensors[1]) - { - vsi_nn_kernel_tensor_release( &reshape_tensors[1] ); - } - if (reshape_tensors[2]) - { - vsi_nn_kernel_tensor_release( &reshape_tensors[2] ); - } + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + vsi_safe_release_tensor(reshape_tensors[2]); + /* Pass parameters to node. */ for( i = 0; i < INTERNAL_MOMENTS_SCALAR_NUM; i ++ ) { diff --git a/src/tim/vx/internal/src/kernel/cl/pool_cl.c b/src/tim/vx/internal/src/kernel/cl/pool_cl.c new file mode 100644 index 0000000..6ca352d --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/pool_cl.c @@ -0,0 +1,318 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ + +#define _MAXPOOL_KERNEL_SOURCE_NAME "maxpool" + +typedef enum +{ + _error = -1, + _MAX = 0, + _AVG +} vsi_nn_pool_type_e; + +// Add kernel hashtable here +#define POOL_HASH_KEY( IN_DTYPE0, OUT_DTYPE, POOL_DTYPE ) \ + (( IN_DTYPE0 << 16 ) | ( OUT_DTYPE << 8 ) | ( POOL_DTYPE )) +#define MAXPOOL_KERNELS( IN_DTYPE0, OUT_DTYPE ) \ + { POOL_HASH_KEY( IN_DTYPE0, OUT_DTYPE, _MAX ), \ + CVIVANTE_NAMESPACE("cl.maxpool_"#IN_DTYPE0"to"#OUT_DTYPE), \ + _MAXPOOL_KERNEL_SOURCE_NAME }, + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type kernel_map[] = +{ + // Register kernel here + MAXPOOL_KERNELS( I32, I32 ) + MAXPOOL_KERNELS( U32, U32 ) + MAXPOOL_KERNELS( F32, F32 ) + MAXPOOL_KERNELS( U32, F32 ) + MAXPOOL_KERNELS( F32, U32 ) +}; + + +/* + * Kernel params + */ + +static vx_param_description_t _maxpool_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _MAXPOOL_PARAM_NUM _cnt_of_array( _maxpool_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_maxpool_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vx_tensor output = (vx_tensor)param[1]; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_size_array_t *output_shape = NULL; + + VSI_UNREFERENCED(param_size); + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]; + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = (output_shape->data[2] + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2]; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + + return status; +}
/* _maxpool_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t pool_type + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype = U8; + vsi_nn_kernel_dtype_e out_dtype = U8; + uint32_t key = 0; + uint32_t i = 0; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (in_dtype == U8) + { + in_dtype = U32; + } + else if (in_dtype == F16) + { + in_dtype = F32; + } + else if (in_dtype == I8 || in_dtype == I16) + { + in_dtype = I32; + } + + if (out_dtype == U8) + { + out_dtype = U32; + } + else if (out_dtype == F16) + { + out_dtype = F32; + } + else if (out_dtype == I8 || out_dtype == I16) + { + out_dtype = I32; + } + + key = POOL_HASH_KEY( in_dtype, out_dtype, pool_type ); + + for ( i = 0; i < (uint32_t)_cnt_of_array(kernel_map); i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)_cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = _maxpool_kernel_param_def; + kernel->info.numParams = (uint32_t)_cnt_of_array(_maxpool_kernel_param_def); + kernel->info.initialize = _maxpool_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_MAXPOOL_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t pool_type = vsi_nn_kernel_param_get_int32( params, "pool_type" ); + int32_t pool_size_x = vsi_nn_kernel_param_get_int32( params, "pool_size_x" ); + int32_t pool_size_y = vsi_nn_kernel_param_get_int32( params, "pool_size_y" ); + int32_t pool_pad_x_left = vsi_nn_kernel_param_get_int32( params, "pool_pad_x_left" ); + int32_t pool_pad_y_top = vsi_nn_kernel_param_get_int32( params, "pool_pad_y_top" ); + int32_t stride_x = vsi_nn_kernel_param_get_int32( params, "stride_x" ); + int32_t stride_y = vsi_nn_kernel_param_get_int32( params, "stride_y" ); + int32_t dilation_x = vsi_nn_kernel_param_get_int32( params, "dilation_x" ); + int32_t dilation_y = vsi_nn_kernel_param_get_int32( params, "dilation_y" ); + int32_t kernel_dia_x = pool_size_x * dilation_x; + int32_t kernel_dia_y = pool_size_y * dilation_y; + float output_scale = vsi_nn_get_tensor_scale(outputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float inout_scale = input_scale / output_scale; + float inout_tail = output_zp - input_zp * inout_scale; + int32_t width = (int32_t)inputs[0]->attr.size[0]; + int32_t height = (int32_t)inputs[0]->attr.size[1]; + + if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( 
outputs[0]->attr.size, + outputs[0]->attr.dim_num )) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, pool_type ); + + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + uint32_t index = 2; + vsi_nn_kernel_node_pack_io( node_params, _MAXPOOL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pool_pad_x_left ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pool_pad_y_top ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_dia_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_dia_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inout_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inout_tail ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _MAXPOOL_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + vsi_nn_kernel_scalar_release( &node_params[13] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( pool, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/clip_evis.c b/src/tim/vx/internal/src/kernel/evis/clip_evis.c index add96c2..1218322 100644 --- a/src/tim/vx/internal/src/kernel/evis/clip_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/clip_evis.c @@ -590,7 +590,7 @@ static vsi_nn_kernel_node_t _setup ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank); - if ( ret ) + if ( !ret ) { return NULL; } diff --git a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c index 4547dfb..bc5e267 100644 --- a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c @@ -349,7 +349,7 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer) input1Scale = (float)((int64_t)1 << -fl); } } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) { input1Scale = attr[1]->asymm.scale; input1Tail = 0 - attr[1]->asymm.zero_point * input1Scale; diff --git a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c index dbdd513..9ed9c08 100644 --- a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c @@ 
-866,21 +866,13 @@ static vsi_nn_kernel_node_t _setup if (axis < 0) { - axis_new = 0; - shapes[0][0] = 1; - shapes[0][1] = 1; - for (i = 0; i < inputs[0]->attr.dim_num; i++) - { - shapes[0][0] *= inputs[0]->attr.size[i]; - } - rs_dim = 2; - } - else - { - vsi_nn_kernel_optimize_softmax_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, - shapes[0], &rs_dim, &axis_new); + axis += (int32_t)inputs[0]->attr.dim_num; } + + vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rs_dim, &axis_new); + if (rs_dim > 3) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index ba7ad75..cf4411e 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -250,7 +250,8 @@ static vsi_status get_gather_tensor_reshape_size sizes[0] = block_size; sizes[1] = elementCnt / block_size; sizes[2] = outerCnt; - if ((elementCnt / block_size) > VSI_NN_MAX_IMAGE_WIDTH) + if ((elementCnt / block_size) > VSI_NN_MAX_IMAGE_WIDTH || + block_size > VSI_NN_MAX_IMAGE_WIDTH) { arrayFlg[0] = 1; } @@ -490,6 +491,8 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) float src0Scale = 1; int32_t dstZP = 0; float dstScale = 1; + int32_t remainder = 0; + int32_t width = 0; uint32_t pack_key = 0; @@ -546,6 +549,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) indices_num *= (int32_t)(input1_shape->data[i]); } batch = (int32_t)(input1_shape->data[input_dims1 - 1]); + width = (int32_t)(input1_shape->data[0]); shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; @@ -562,6 +566,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) (IN0_TYPE | (OUT_TYPE << 8)) pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype); + remainder = indices_num % 4; { uint16_t M0 = 0; @@ -656,6 +661,8 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) { status |= vsi_nn_kernel_gpu_add_param(node, "batch", &batch); } + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder); CHECK_STATUS_FAIL_GOTO(status, OnError ); OnError: @@ -763,20 +770,36 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; - int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); - int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); - int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); + int32_t block_size = 1; + int32_t block_num = 1; + int32_t axis_num = 0; int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" ); int32_t axis0_flg = 0; - int32_t is_array = block_size > VSI_NN_MAX_BLOCK_SIZE ? 1 : 0; + int32_t is_array = 0; int32_t is_batch = batch_dims > 0 ? 1 : 0; vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3; - int32_t i = 0; + vsi_size_t *input_size = inputs[0]->attr.size; + uint32_t i = 0; + uint32_t r_rank = vsi_nn_GetTensorIsScalar(inputs[0]) ? 0 : inputs[0]->attr.dim_num; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); + for (i = 0; i < (uint32_t)axis; ++i) + { + block_size *= (int32_t)input_size[i]; + } + + axis_num = (int32_t)input_size[axis]; + + for (i = axis + 1; i < r_rank - batch_dims; ++i) + { + block_num *= (int32_t)input_size[i]; + } + + is_array = block_size > VSI_NN_MAX_BLOCK_SIZE ? 
1 : 0; + if (axis == 0) { status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, batch_dims, 0, &is_array); diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c index 1bfdb49..631cfd9 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c @@ -47,11 +47,10 @@ typedef enum } _internal_kernel_e; #define _GRUCELL_ACTIVATION_KERNEL_SOURCE "grucell_activation" -#define _GRUCELL_ACTIVATION_KERNEL_NAME CVIVANTE_NAMESPACE("evis.grucell_activation") #define _CDNN_KERNEL_SOURCE0 "grucell_cdnn_activation" #define _CDNN_KERNEL_SOURCE1 "grucell_cdnn_activation_u8" -#define _GRUCELL_ACTIVATION_CDNN_KERNEL_NAME CVIVANTE_NAMESPACE("evis.grucell_cdnn_activation") +#define _KERNEL_SOURCE2 "grucell_cdnn_activation_bf16" typedef enum _batch_fisrt_layerout_e { @@ -114,6 +113,11 @@ static const _kernel_map_type _grucell_activation_kernel_map[] = PACK_KERNEL_MAP( U8, U8, U8, U8, hsigmoid, VSI_NN_ACT_TANH, CN), PACK_KERNEL_MAP( F16, F16, F16, F16, hsigmoid, VSI_NN_ACT_TANH, CN), PACK_KERNEL_MAP( F16, F16, F16, U8, hsigmoid, VSI_NN_ACT_TANH, CN), + + PACK_KERNEL_MAP( BF16, BF16, BF16, BF16, sigmoid, VSI_NN_ACT_TANH, NC), + PACK_KERNEL_MAP( BF16, BF16, BF16, BF16, sigmoid, VSI_NN_ACT_TANH, CN), + PACK_KERNEL_MAP( BF16, BF16, BF16, BF16, hsigmoid, VSI_NN_ACT_TANH, NC), + PACK_KERNEL_MAP( BF16, BF16, BF16, BF16, hsigmoid, VSI_NN_ACT_TANH, CN), }; static const _kernel_map_type _grucell_cunn_sep_activation_kernel_map[] = @@ -130,6 +134,12 @@ static const _kernel_map_type _grucell_cunn_sep_activation_kernel_map[] = PACK_KERNEL_CDNN_SEP_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN, _CDNN_KERNEL_SOURCE1 ), PACK_KERNEL_CDNN_SEP_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN_FULL, _CDNN_KERNEL_SOURCE1 ), + + PACK_KERNEL_CDNN_SEP_MAP( BF16, BF16, BF16, BF16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, NC, _KERNEL_SOURCE2 ), + + PACK_KERNEL_CDNN_SEP_MAP( BF16, BF16, BF16, BF16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN, _KERNEL_SOURCE2 ), + + PACK_KERNEL_CDNN_SEP_MAP( BF16, BF16, BF16, BF16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN_FULL, _KERNEL_SOURCE2 ), }; static const _kernel_map_type _grucell_cunn_activation_kernel_map[] = @@ -142,6 +152,10 @@ static const _kernel_map_type _grucell_cunn_activation_kernel_map[] = PACK_KERNEL_CDNN_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, NC, _CDNN_KERNEL_SOURCE1 ), PACK_KERNEL_CDNN_MAP( U8, U8, U8, U8, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN, _CDNN_KERNEL_SOURCE1 ), + + PACK_KERNEL_CDNN_MAP( BF16, BF16, BF16, BF16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, NC, _KERNEL_SOURCE2 ), + + PACK_KERNEL_CDNN_MAP( BF16, BF16, BF16, BF16, VSI_NN_ACT_SIGMOID, VSI_NN_ACT_TANH, CN, _KERNEL_SOURCE2 ), }; /* @@ -322,6 +336,37 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer) "tensorScale", &tensorScale ); CHECK_STATUS_FAIL_GOTO(status, final ); } + break; + case _PACK_SELECT_KEY(BF16, BF16, BF16, BF16): + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 
0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "tensorZP", &tensorZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "tensorScale", &tensorScale ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; default: break; } @@ -604,6 +649,34 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_cdnn_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); } break; + case _PACK_SELECT_KEY(BF16, BF16, BF16, BF16): + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; default: break; } diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c index 9ad5852..d49d444 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c @@ -80,9 +80,11 @@ typedef struct static const _kernel_map_type _grucell_activation_sma_kernel_map[] = { - PACK_KERNEL_MAP(F16, F16, F16, F16), + PACK_KERNEL_MAP(F16, F16, F16, F16), + PACK_KERNEL_MAP(BF16, BF16, BF16, BF16), - PACK_KERNEL_MAP_2D(F16, F16, F16, F16), + PACK_KERNEL_MAP_2D(F16, F16, F16, F16), + PACK_KERNEL_MAP_2D(BF16, BF16, BF16, BF16), }; /* @@ -200,6 +202,45 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_sma_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); } break; + case _PACK_A_GRUCELL_ACTIVATION_SMA_KEY(BF16, BF16, BF16, BF16): + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t 
uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; default: break; } diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c index 7adf6bf..63360b4 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c @@ -68,14 +68,16 @@ typedef struct static const _kernel_map_type _grucell_activation_z_h_kernel_map[] = { // Register kernel here - PACK_KERNEL_MAP( U8, F16, U8, SIGMOID ), - PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ), - PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ), - PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ), - PACK_KERNEL_MAP( U8, F16, U8, HSIGMOID ), - PACK_KERNEL_MAP( I8, F16, I8, HSIGMOID ), - PACK_KERNEL_MAP( I16, F16, I16, HSIGMOID ), - PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ), + PACK_KERNEL_MAP( U8, F16, U8, SIGMOID ), + PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ), + PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID ), + PACK_KERNEL_MAP( U8, F16, U8, HSIGMOID ), + PACK_KERNEL_MAP( I8, F16, I8, HSIGMOID ), + PACK_KERNEL_MAP( I16, F16, I16, HSIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ), + PACK_KERNEL_MAP( BF16, BF16, BF16, HSIGMOID ), }; /* @@ -218,6 +220,34 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); } break; + case _PACK_SELECT_KEY(BF16, BF16, BF16): + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; case _PACK_SELECT_KEY(U8, F16, U8): case _PACK_SELECT_KEY(I8, F16, I8): case _PACK_SELECT_KEY(I16, F16, I16): diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c index afd8723..e3a2899 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c 
+++ b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c @@ -67,14 +67,16 @@ typedef struct static const _kernel_map_type _grucell_h_times_activation_r_kernel_map[] = { // Register kernel here - PACK_KERNEL_MAP( U8, F16, F16, SIGMOID ), - PACK_KERNEL_MAP( I8, F16, F16, SIGMOID ), - PACK_KERNEL_MAP( I16, F16, F16, SIGMOID ), - PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ), - PACK_KERNEL_MAP( U8, F16, F16, HSIGMOID ), - PACK_KERNEL_MAP( I8, F16, F16, HSIGMOID ), - PACK_KERNEL_MAP( I16, F16, F16, HSIGMOID ), - PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ), + PACK_KERNEL_MAP( U8, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( I8, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( I16, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID ), + PACK_KERNEL_MAP( U8, F16, F16, HSIGMOID ), + PACK_KERNEL_MAP( I8, F16, F16, HSIGMOID ), + PACK_KERNEL_MAP( I16, F16, F16, HSIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ), + PACK_KERNEL_MAP( BF16, BF16, BF16, HSIGMOID ), }; /* @@ -194,6 +196,34 @@ DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); } break; + case _PACK_SELECT_KEY(BF16, BF16, BF16): + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; case _PACK_SELECT_KEY(U8, F16, F16): case _PACK_SELECT_KEY(I8, F16, F16): case _PACK_SELECT_KEY(I16, F16, F16): diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c index 60d932b..f53a56a 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c @@ -70,14 +70,16 @@ typedef struct static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] = { // Register kernel here - PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, TANH ), - PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, TANH ), - PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, TANH ), - PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, TANH ), - PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, SIGMOID ), - PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, SIGMOID ), - PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, SIGMOID ), - PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, SIGMOID ), + PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, TANH ), + PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, TANH ), + PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, TANH ), + PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, TANH ), + PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID, TANH ), + PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, SIGMOID ), + 
PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, SIGMOID ), + PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, SIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, SIGMOID ), + PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID, SIGMOID ), }; @@ -224,6 +226,34 @@ DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); } break; + case _PACK_SELECT_KEY(BF16, BF16, BF16): + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; case _PACK_SELECT_KEY(U8, F16, U8): case _PACK_SELECT_KEY(I8, F16, I8): case _PACK_SELECT_KEY(I16, F16, I16): diff --git a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c index 95232b9..46ab93f 100644 --- a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c @@ -439,6 +439,32 @@ static const _kernel_map_type _lstmunit_activation_kernel_map[] = GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, I16, F16, HARD_SIGMOID, SP) GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, I8, F16, HARD_SIGMOID, SP) GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, U8, U8, F16, HARD_SIGMOID, SP) + + /* BF16 type */ + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, BF16, BF16, BF16, SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, BF16, BF16, BF16, SIGMOID, LP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, BF16, BF16, BF16, SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, BF16, BF16, BF16, SIGMOID, L) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, BF16, BF16, BF16, SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, BF16, BF16, BF16, SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, BF16, BF16, BF16, SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, BF16, BF16, BF16, SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, BF16, BF16, BF16, SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, BF16, BF16, BF16, SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, BF16, BF16, BF16, SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, BF16, BF16, BF16, SIGMOID, CS) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 1, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, CLP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 1, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, LP) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 1, 0, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, CL) + GEN_LSTMUNIT_STRUCT_ITEMS(1, 0, 0, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, L) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 1, 0, BF16, BF16, BF16, HARD_SIGMOID, BP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 1, 0, BF16, BF16, BF16, HARD_SIGMOID, B) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 1, 0, BF16, 
BF16, BF16, HARD_SIGMOID, CBP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 1, 0, BF16, BF16, BF16, HARD_SIGMOID, CB) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 1, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, SP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 0, 0, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, S) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 1, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, CSP) + GEN_LSTMUNIT_STRUCT_ITEMS(0, 1, 0, 0, 0, BF16, BF16, BF16, HARD_SIGMOID, CS) }; @@ -1135,6 +1161,26 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer) 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; if (dstQuantType == VSI_NN_KERNEL_QUANT_DFP) { @@ -1152,31 +1198,41 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer) if ( cellFormat == F16 ) { - vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_4x4", &uniExtractHalf4_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_4x4", &uniExtractHalf4_4x4); } if ( dstFormat == F16 ) { - vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); + } + else if ( dstFormat != BF16 ) + { + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + } + + if ( cellFormat == BF16 && dstFormat == BF16) + { + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", &uniExtractOddData_2x8); } else { - vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16toFp32_4x4", &uniFp16toFp32_4x4); } + CHECK_STATUS_FAIL_GOTO(status, final ); - vsi_nn_kernel_gpu_add_param(node, "uniFp16toFp32_4x4", &uniFp16toFp32_4x4); - vsi_nn_kernel_gpu_add_param(node, "logE", &logE); - vsi_nn_kernel_gpu_add_param(node, "twoLogE", &twoLogE); - vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale); - vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP); - vsi_nn_kernel_gpu_add_param(node, "forget_bias", &forget_bias); - vsi_nn_kernel_gpu_add_param(node, "clip_Min_F", clip_Min_F); - vsi_nn_kernel_gpu_add_param(node, "clip_Max_F", clip_Max_F); - + status = vsi_nn_kernel_gpu_add_param(node, "logE", &logE); + status |= vsi_nn_kernel_gpu_add_param(node, "twoLogE", &twoLogE); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP); + status |= vsi_nn_kernel_gpu_add_param(node, "forget_bias", &forget_bias); + status |= vsi_nn_kernel_gpu_add_param(node, "clip_Min_F", clip_Min_F); + status |= vsi_nn_kernel_gpu_add_param(node, "clip_Max_F", clip_Max_F); if ( !_is_ln && 
input_attr[S_INPUT_FC_F]->dtype == F16 ) { - vsi_nn_kernel_gpu_add_param(node, "uniFp16AddFp16toFp32_4x4", &uniFp16AddFp16toFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16AddFp16toFp32_4x4", &uniFp16AddFp16toFp32_4x4); } + CHECK_STATUS_FAIL_GOTO(status, final ); if (input_attr[S_INPUT_FC_F]->dtype == U8 && input_attr[S_INPUT_FC_F]->quant == VSI_NN_KERNEL_QUANT_ASYMM) @@ -1380,8 +1436,8 @@ static vsi_status _query_kernel vx_param_description_t * param_def = NULL; size_t param_def_size = _LSTMUNIT_ACTIVATION_MAX_PARAM_NUM; vx_kernel_initialize_f initializer = _lstmunit_activation_initializer; - uint32_t key; - uint32_t i; + uint32_t key = 0; + uint32_t i = 0; set_vx_param_description_t( lstm_activation, ¶m_def ); diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c index f5dc60b..1b15caa 100644 --- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -36,6 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" __BEGIN_DECLS @@ -1576,21 +1577,22 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t tmp_params[_MATRIX_MUL_CROSS_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + vsi_nn_tensor_t* tmp_inputs[2] = {NULL}; + vsi_nn_tensor_t* tmp_outputs[1] = {NULL}; int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" ); int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" ); int32_t adjointA = vsi_nn_kernel_param_get_int32( params, "adjointA" ); int32_t adjointB = vsi_nn_kernel_param_get_int32( params, "adjointB" ); - uint32_t cross_flg = vsi_nn_kernel_param_get_int32( params, "cross_flg" ); - size_t tmp_size = 0; - uint32_t* size_axis_in_out = NULL; - uint32_t* stride_axis_in_out = NULL; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + uint32_t new_rank[3] = {0}; vsi_size_t M = inputs[0]->attr.size[1]; vsi_size_t K = inputs[0]->attr.size[0]; vsi_size_t N = inputs[1]->attr.size[0]; vsi_size_t depthA = 1, depthB = 1; - size_axis_in_out = (uint32_t *)vsi_nn_kernel_param_get_buffer( params, "size_axis_inner_outer", &tmp_size); - stride_axis_in_out = (uint32_t *)vsi_nn_kernel_param_get_buffer( params, "stride_axis_inner_outer", &tmp_size); + uint32_t cross_flg = 0; + uint32_t size_axis_in_out[3] = {0}; + uint32_t stride_axis_in_out[9] = {0}; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); @@ -1609,35 +1611,62 @@ static vsi_nn_kernel_node_t _setup return NULL; } + status = vsi_nn_kernel_optimize_matrixmul_broadcast_shape( + inputs[0]->attr.size, + inputs[1]->attr.size, + outputs[0]->attr.size, + inputs[0]->attr.dim_num, + inputs[1]->attr.dim_num, + outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], new_rank, + &cross_flg, size_axis_in_out, stride_axis_in_out); + if (status) + { + tmp_inputs[0] = vsi_nn_reshape_tensor(graph, inputs[0], shapes[0], new_rank[0]); + tmp_inputs[1] = vsi_nn_reshape_tensor(graph, inputs[1], shapes[1], new_rank[1]); + tmp_outputs[0] = vsi_nn_reshape_tensor(graph, outputs[0], shapes[2], new_rank[2]); + + M = tmp_inputs[0]->attr.size[1]; + K = tmp_inputs[0]->attr.size[0]; + N = tmp_inputs[1]->attr.size[0]; + } + else + { + VSILOGE("illegal inputs shape"); + status = VSI_FAILURE; + goto final; + } + if (transposeA) { - K = inputs[0]->attr.size[1]; - M = inputs[0]->attr.size[0]; + K = 
tmp_inputs[0]->attr.size[1]; + M = tmp_inputs[0]->attr.size[0]; } else if (transposeB) { - N = inputs[1]->attr.size[1]; + N = tmp_inputs[1]->attr.size[1]; } - depthA = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; - depthB = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1; + depthA = tmp_inputs[0]->attr.dim_num > 2 ? tmp_inputs[0]->attr.size[2] : 1; + depthB = tmp_inputs[1]->attr.dim_num > 2 ? tmp_inputs[1]->attr.size[2] : 1; + if (M == 1 && depthB == 1 && depthA > 1) { vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; - shape[0] = inputs[0]->attr.size[0]; - shape[1] = inputs[0]->attr.size[2]; + shape[0] = tmp_inputs[0]->attr.size[0]; + shape[1] = tmp_inputs[0]->attr.size[2]; shape[2] = 1; - shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; - rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); + shape[3] = tmp_inputs[0]->attr.dim_num > 3 ? tmp_inputs[0]->attr.size[3] : 1; + rs_input = vsi_nn_kernel_tensor_reshape( tmp_inputs[0]->t, shape, 4 ); - shape[0] = outputs[0]->attr.size[0]; - shape[1] = outputs[0]->attr.size[2]; + shape[0] = tmp_outputs[0]->attr.size[0]; + shape[1] = tmp_outputs[0]->attr.size[2]; shape[2] = 1; - shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; - rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); + shape[3] = tmp_outputs[0]->attr.dim_num > 3 ? tmp_outputs[0]->attr.size[3] : 1; + rs_output = vsi_nn_kernel_tensor_reshape( tmp_outputs[0]->t, shape, 4 ); } - status = _query_kernel( inputs, outputs, kernel, transposeA, transposeB, cross_flg ); + status = _query_kernel( tmp_inputs, tmp_outputs, kernel, transposeA, transposeB, cross_flg ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -1649,13 +1678,13 @@ static vsi_nn_kernel_node_t _setup if (rs_input) { tmp_params[0] = rs_input; - tmp_params[1] = (vsi_nn_kernel_node_param_t)(inputs[1]->t); + tmp_params[1] = (vsi_nn_kernel_node_param_t)(tmp_inputs[1]->t); tmp_params[2] = rs_output; } else { vsi_nn_kernel_node_pack_io( tmp_params, param_num, - inputs, 2, outputs, 1 ); + tmp_inputs, 2, tmp_outputs, 1 ); } tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeA ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeB ); @@ -1725,6 +1754,10 @@ static vsi_nn_kernel_node_t _setup } } } +final: + vsi_safe_release_tensor( tmp_inputs[0] ); + vsi_safe_release_tensor( tmp_inputs[1] ); + vsi_safe_release_tensor( tmp_outputs[0] ); if (rs_input) { vsi_nn_kernel_tensor_release( &rs_input ); diff --git a/src/tim/vx/internal/src/kernel/evis/pool_evis.c b/src/tim/vx/internal/src/kernel/evis/pool_evis.c new file mode 100644 index 0000000..25ff3c2 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/pool_evis.c @@ -0,0 +1,374 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +#define KERNEL_SOURCE_0 "maxpool", + + typedef enum +{ + _error = -1, + _MAX = 0, + _AVG +} vsi_nn_pool_type_e; + +#define HASH_POOL_KEY(_input_type, _output_type, _pool_type, _image_2d) \ + ((_input_type << 24) | (_output_type << 16) | (_pool_type << 8) | (_image_2d)) + +#define HASH_MAXPOOL_SH_KERNEL_NAME(SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.maxpool_"#SRC_TYPE"to"#DST_TYPE) + +#define MAXPOOL_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_POOL_KEY(IN0_TYPE, OUT_TYPE, _MAX, 0), \ + HASH_MAXPOOL_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + MAXPOOL_KERNELS(F16, F16, KERNEL_SOURCE_0) + MAXPOOL_KERNELS(BF16, BF16, KERNEL_SOURCE_0) + MAXPOOL_KERNELS(I8, I8, KERNEL_SOURCE_0) + MAXPOOL_KERNELS(U8, U8, KERNEL_SOURCE_0) + MAXPOOL_KERNELS(I16, I16, KERNEL_SOURCE_0) + MAXPOOL_KERNELS(U8, F16, KERNEL_SOURCE_0) + MAXPOOL_KERNELS(I8, F16, KERNEL_SOURCE_0) + MAXPOOL_KERNELS(I16, F16, KERNEL_SOURCE_0) + MAXPOOL_KERNELS(F16, I8, KERNEL_SOURCE_0) + MAXPOOL_KERNELS(F16, U8, KERNEL_SOURCE_0) + MAXPOOL_KERNELS(F16, I16, KERNEL_SOURCE_0) +}; + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} +}; +#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) + +DEF_KERNEL_INITIALIZER(_maxpool_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + float input_zp = 0.0f; + float input_scale = 1.0f; + float output_zp = 0; + float output_scale = 1.0f; + float inout_scale = 1.0f; + float inout_tail = 0.0f; + int32_t width = 0; + int32_t height = 0; + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_size_array_t * out_shape = NULL; + uint32_t pack_key = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create 
tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_shape = attr[1]->shape; + + width = (int32_t)attr[0]->shape->data[0]; + height = (int32_t)attr[0]->shape->data[1]; + + input_scale = attr[0]->scale; + input_zp = (float)attr[0]->zero_point; + output_scale = attr[1]->scale; + output_zp = (float)attr[1]->zero_point; + + inout_scale = input_scale / output_scale; + inout_tail = output_zp - input_zp * inout_scale; + +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ + (IN0_TYPE | ( OUT_TYPE << 16)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype ); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]; + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvF16toFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "inout_scale", &inout_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, "inout_tail", &inout_tail ); + status |= vsi_nn_kernel_gpu_add_param( node, "width", &width ); + status |= vsi_nn_kernel_gpu_add_param( node, "height", &height ); + CHECK_STATUS_FAIL_GOTO(status, final); + + switch( pack_key ) + { + case _PACK_SELECT_KEY( I8, I8 ): + case _PACK_SELECT_KEY( U8, U8 ): + case _PACK_SELECT_KEY( I16, I16 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( F16, I8 ): + case _PACK_SELECT_KEY( F16, U8 ): + case _PACK_SELECT_KEY( F16, I16 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvF16toFp32_4x4", &uniConvF16toFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", 
&uniConvertInt32toUint8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY( BF16, BF16 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + } + +#undef _PACK_SELECT_KEY + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _maxpool_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + int32_t pool_type, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + size_t i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + key = HASH_POOL_KEY( input0_dtype, output_dtype, pool_type, 0 ); + + for ( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _maxpool_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t pool_type = vsi_nn_kernel_param_get_int32( params, "pool_type" ); + int32_t pool_size_x = vsi_nn_kernel_param_get_int32( params, "pool_size_x" ); + int32_t pool_size_y = vsi_nn_kernel_param_get_int32( params, "pool_size_y" ); + int32_t pool_pad_x_left = vsi_nn_kernel_param_get_int32( params, "pool_pad_x_left" ); + int32_t pool_pad_y_top = vsi_nn_kernel_param_get_int32( params, "pool_pad_y_top" ); + int32_t stride_x = vsi_nn_kernel_param_get_int32( params, "stride_x" ); + int32_t stride_y = vsi_nn_kernel_param_get_int32( params, "stride_y" ); + int32_t dilation_x = vsi_nn_kernel_param_get_int32( params, "dilation_x" ); + int32_t dilation_y = vsi_nn_kernel_param_get_int32( params, "dilation_y" ); + int32_t kernel_dia_x = pool_size_x * dilation_x; + int32_t kernel_dia_y = pool_size_y * dilation_y; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, pool_type, kernel ); + if ( VSI_SUCCESS == status ) + { + node = vsi_nn_kernel_create_node( graph, 
kernel ); + if ( node ) + { + uint32_t index = 2; + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM, + inputs, 1, outputs, 1 ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pool_pad_x_left ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pool_pad_y_top ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_dia_x ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_dia_y ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_x ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_y ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[2] ); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + vsi_nn_kernel_scalar_release( &tmp_params[5] ); + vsi_nn_kernel_scalar_release( &tmp_params[6] ); + vsi_nn_kernel_scalar_release( &tmp_params[7] ); + vsi_nn_kernel_scalar_release( &tmp_params[8] ); + vsi_nn_kernel_scalar_release( &tmp_params[9] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( pool, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c index a0d76f4..c4dfe0a 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c @@ -117,6 +117,17 @@ static vx_param_description_t vxPreProcessNv12Kernel_param_def[] = }; #define _EVIS_PRE_PROCESS_NV12_PARAM_NUM _cnt_of_array(vxPreProcessNv12Kernel_param_def) +static vsi_bool _check_nv12_type_from_env() +{ + vsi_bool ret = FALSE; + char* env_s = vsi_nn_getenv("VSI_NN_ENABLE_OCV_NV12"); + if (env_s) + { + ret = TRUE; + } + return ret; +} + DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) ( vsi_nn_kernel_node_t node, @@ -145,6 +156,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + vsi_bool ocv_nv12 = _check_nv12_type_from_env(); VSI_UNREFERENCED(param_size); @@ -208,7 +220,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertNV12toB_4x4 = {{ 0x05050505, // TCfg 0x04040404, // ASelt @@ -239,6 +250,17 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractYtoShortSub16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{ 0x99999999, // TCfg 0x44444444, // ASelt @@ -259,6 +281,61 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 
// Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, + 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, + 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if (ocv_nv12) + { + uniConvertNV12toB_4x4.data[2] = 0x00010000; + uniConvertNV12toB_4x4.data[3] = 0x00230022; + uniConvertNV12toB_4x4.data[8] = 0x40093ca7; + uniConvertNV12toB_4x4.data[10] = 0x40093ca7; + uniConvertNV12toB_4x4.data[12] = 0x40093ca7; + uniConvertNV12toB_4x4.data[14] = 0x40093ca7; + + uniConvertNV12toG_4x4.data[2] = 0x01010100; + uniConvertNV12toG_4x4.data[3] = 0x03230322; + uniConvertNV12toG_4x4.data[8] = 0x36413ca7; + uniConvertNV12toG_4x4.data[9] = 0x00003a81; + uniConvertNV12toG_4x4.data[10] = 0x36413ca7; + uniConvertNV12toG_4x4.data[11] = 0x00003a81; + uniConvertNV12toG_4x4.data[12] = 0x36413ca7; + uniConvertNV12toG_4x4.data[13] = 0x00003a81; + uniConvertNV12toG_4x4.data[14] = 0x36413ca7; + uniConvertNV12toG_4x4.data[15] = 0x00003a81; + + uniConvertNV12toR_4x4.data[2] = 0x00110010; + uniConvertNV12toR_4x4.data[3] = 0x00330032; + uniConvertNV12toR_4x4.data[8] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[10] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[12] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[14] = 0x3e623ca7; + + uniExtractUVtoCharSub128_2x8.data[2] = 0x03020100; + uniExtractUVtoCharSub128_2x8.data[3] = 0x07060504; + + uniExtractYtoShortSub16_2x8.data[0] = 0x99999999; + uniExtractYtoShortSub16_2x8.data[1] = 0x44444444; + uniExtractYtoShortSub16_2x8.data[4] = 0xaaaaaaaa; + uniExtractYtoShortSub16_2x8.data[8] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[9] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[10] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[11] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[12] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[13] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[14] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[15] = 0x00010001; + } status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4); @@ -266,12 +343,15 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYtoShortSub16_2x8", &uniExtractYtoShortSub16_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b); status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g); status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r); status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4); + CHECK_STATUS_FAIL_GOTO(status, OnError); switch( attr[0]->dtype ) { case U8: @@ -335,6 +415,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) float outputScaleVar_b = 0.0f, 
outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f; float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; float resize = 0.0f; + vsi_bool ocv_nv12 = _check_nv12_type_from_env(); vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * out_shape = NULL; @@ -445,6 +526,17 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertYtoShortSub16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ 0x11111111, // TCfg 0x11110000, // ASelt @@ -487,11 +579,64 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) 0x00000000, 0x00010000, 0x00000000, 0x00010000, 0x00000000, 0x00010000, 0x00000000, 0x00010000 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, + 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, + 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if (ocv_nv12) + { + uniConvertNV12toB_4x4.data[2] = 0x00010000; + uniConvertNV12toB_4x4.data[3] = 0x00230022; + uniConvertNV12toB_4x4.data[8] = 0x40093ca7; + uniConvertNV12toB_4x4.data[10] = 0x40093ca7; + uniConvertNV12toB_4x4.data[12] = 0x40093ca7; + uniConvertNV12toB_4x4.data[14] = 0x40093ca7; + + uniConvertNV12toG_4x4.data[2] = 0x01010100; + uniConvertNV12toG_4x4.data[3] = 0x03230322; + uniConvertNV12toG_4x4.data[8] = 0x36413ca7; + uniConvertNV12toG_4x4.data[9] = 0x00003a81; + uniConvertNV12toG_4x4.data[10] = 0x36413ca7; + uniConvertNV12toG_4x4.data[11] = 0x00003a81; + uniConvertNV12toG_4x4.data[12] = 0x36413ca7; + uniConvertNV12toG_4x4.data[13] = 0x00003a81; + uniConvertNV12toG_4x4.data[14] = 0x36413ca7; + uniConvertNV12toG_4x4.data[15] = 0x00003a81; + + uniConvertNV12toR_4x4.data[2] = 0x00110010; + uniConvertNV12toR_4x4.data[3] = 0x00330032; + uniConvertNV12toR_4x4.data[8] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[10] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[12] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[14] = 0x3e623ca7; + + uniConvertYtoShortSub16_2x8.data[0] = 0x99999999; + uniConvertYtoShortSub16_2x8.data[1] = 0x44444444; + uniConvertYtoShortSub16_2x8.data[4] = 0xaaaaaaaa; + uniConvertYtoShortSub16_2x8.data[8] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[9] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[10] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[11] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[12] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[13] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[14] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[15] = 0x00010001; + } status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUVtoCharSub128_2x8", 
&uniConvertUVtoCharSub128_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYtoShortSub16_2x8", &uniConvertYtoShortSub16_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16); status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16); status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b); @@ -506,6 +651,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8); } + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4); CHECK_STATUS_FAIL_GOTO(status, OnError ); status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c index 61d421d..98b9dbd 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c @@ -249,6 +249,18 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer) 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, + 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, + 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; status = vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toB_4x4", &uniConvertYUV422toB_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toG_4x4", &uniConvertYUV422toG_4x4); @@ -262,6 +274,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4); + CHECK_STATUS_FAIL_GOTO(status, OnError); switch( attr[0]->dtype ) { case U8: @@ -461,6 +475,18 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer) 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, + 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, + 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; status = vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toB_4x4", &uniConvertYUV422toB_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toG_4x4", &uniConvertYUV422toG_4x4); @@ -477,6 +503,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYtoShortSub16_4x4", &uniExtractYtoShortSub16_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); + status 
|= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4); CHECK_STATUS_FAIL_GOTO(status, OnError ); switch( attr[0]->dtype ) diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c index 596d528..ff8d5c0 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c @@ -664,9 +664,15 @@ static vsi_nn_kernel_node_t _setup hashkeys[1] = BILINEAR_NHWC_BOUND_HASH_KEY( in_dtype, out_dtype, (vsi_size_t)up_scale ); status = _query_kernel( ikernels[0], hashkeys[0], 0); - CHECK_STATUS_FAIL_GOTO(status, final ); + if (status != VSI_SUCCESS) + { + goto final; + } status = _query_kernel( kernel, hashkeys[1], 1); - CHECK_STATUS_FAIL_GOTO(status, final ); + if (status != VSI_SUCCESS) + { + goto final; + } shapes[0][0] = depth * inputs[0]->attr.size[1]; shapes[0][1] = inputs[0]->attr.size[2]; diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c index 43ea15c..b59bccf 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c @@ -532,10 +532,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer) width = (width + 15) / 16; } - input0_zp = attr[0]->asymm.zero_point; - input0_scale = attr[0]->asymm.scale; - output_zp = attr[1]->asymm.zero_point; - output_scale = 1.0f / attr[1]->asymm.scale; + input0_zp = attr[0]->zero_point; + input0_scale = attr[0]->scale; + output_zp = attr[1]->zero_point; + output_scale = 1.0f / attr[1]->scale; gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; @@ -670,10 +670,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer) update_width = (int32_t)(attr[1]->shape->data[0]); index_num = (int32_t)(attr[0]->shape->data[1]); - input1_zp = attr[1]->asymm.zero_point; - input1_scale = attr[1]->asymm.scale; - output_zp = attr[2]->asymm.zero_point; - output_scale = 1.0f / attr[2]->asymm.scale; + input1_zp = attr[1]->zero_point; + input1_scale = attr[1]->scale; + output_zp = attr[2]->zero_point; + output_scale = 1.0f / attr[2]->scale; if (coord_dim == 5) { @@ -916,10 +916,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer) } width = element_size / 8; - input_zp0 = attr[0]->asymm.zero_point; - input_scale0 = attr[0]->asymm.scale; - output_zp = attr[1]->asymm.zero_point; - output_scale = attr[1]->asymm.scale; + input_zp0 = attr[0]->zero_point; + input_scale0 = attr[0]->scale; + output_zp = attr[1]->zero_point; + output_scale = attr[1]->scale; if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) { @@ -933,9 +933,14 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer) gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - - gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) - / gpu_param.global_scale[0], 4); + if (element_size < 8) + { + gpu_param.global_size[0] = element_size; + } + else + { + gpu_param.global_size[0] = width; + } gpu_param.global_size[1] = 1; gpu_param.global_size[2] = 1; @@ -1006,7 +1011,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer) int32_t coord_dim = 0; int32_t strides[VSI_NN_MAX_DIM_NUM] = {0}; int32_t coord_strides[8] = {0}; - int32_t *coord_strides1 = coord_strides + 4; + int32_t coord_strides1[4] = {0}; int32_t input2_zp = 0; int32_t i = 0; @@ 
-1046,13 +1051,14 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer) width = block_size / 4; } - input2_zp = attr[1]->asymm.zero_point; + input2_zp = attr[1]->zero_point; coord_strides[coord_dim - 1] = 1; for (i = 0; i < coord_dim - 1; i++) { coord_strides[i] = strides[coord_dim - 2 - i]; } + memcpy(coord_strides1, coord_strides + 4, 4 * sizeof(int32_t)); gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; @@ -1165,7 +1171,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer) int32_t coord_dim = 0; int32_t strides[VSI_NN_MAX_DIM_NUM] = {0}; int32_t coord_strides[8] = {0}; - int32_t *coord_strides1 = coord_strides + 4; + int32_t coord_strides1[4] = {0}; float output_zp = 0; float input_scale = 1.0f; float output_scale = 1.0f; @@ -1202,9 +1208,9 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer) update_width = (int32_t)(attr[1]->shape->data[0]); index_num = (int32_t)(attr[0]->shape->data[1]); - input_scale = attr[1]->asymm.scale; - output_scale = attr[2]->asymm.scale; - output_zp = (float)attr[2]->asymm.zero_point; + input_scale = attr[1]->scale; + output_scale = attr[2]->scale; + output_zp = (float)attr[2]->zero_point; if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) { input_scale = 1.0f; @@ -1220,6 +1226,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer) { coord_strides[i] = strides[coord_dim - 2 - i]; } + memcpy(coord_strides1, coord_strides + 4, 4 * sizeof(int32_t)); width = block_size; if (block_size % 4 == 0) @@ -1337,9 +1344,14 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_copy_initializer) gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - - gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) - / gpu_param.global_scale[0], 4); + if (element_size < 8) + { + gpu_param.global_size[0] = element_size; + } + else + { + gpu_param.global_size[0] = width; + } gpu_param.global_size[1] = 1; gpu_param.global_size[2] = 1; diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c index 18919b4..547254f 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c @@ -479,6 +479,7 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape vsi_size_t* temp_shape_y = NULL; vsi_size_t* temp_shape_output = NULL; vsi_size_t temp_rank = 0; + vsi_bool exceed_maxsize = FALSE; #define _swap_size(a, b, tmp) \ { \ @@ -490,6 +491,27 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape VSI_UNREFERENCED(rank_x); VSI_UNREFERENCED(rank); + for (i = 0; i < rank_output; i++) + { + if (shape_output[i] > GPU_TENSOR_MAX_WIDTH) + { + exceed_maxsize = TRUE; + } + } + + if (exceed_maxsize) + { + for (i = 0; i < rank_output; i++) + { + out_shape_x[i] = shape_x[i]; + out_shape_y[i] = multiples[i]; + out_shape_output[i] = shape_output[i]; + } + *out_rank_output = rank_output; + ret = TRUE; + goto final; + } + temp_shape_x = (vsi_size_t*)malloc(rank * sizeof(vsi_size_t)); if (temp_shape_x == NULL) { diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index 6c6dda9..974ad58 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -156,5 +156,17 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(inverse_sigmoid) #if (VX_TENSOR_SELECT_VX_SUPPORT) REGISTER_VX_FIRST_KERNEL_SELECTOR(select) #endif +#if 
(VX_TENSOR_POW_API_SUPPORT) +REGISTER_VX_FIRST_KERNEL_SELECTOR(pow) +#endif +#if (VX_TENSOR_GATHER_API_SUPPORT) +REGISTER_VX_FIRST_KERNEL_SELECTOR(gather) +#endif +#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT) +REGISTER_VX_FIRST_KERNEL_SELECTOR(relational_ops) +#endif +#if (VX_TENSOR_TILE_API_SUPPORT) +REGISTER_VX_FIRST_KERNEL_SELECTOR(tile) +#endif __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/vx/gather_vx.c b/src/tim/vx/internal/src/kernel/vx/gather_vx.c new file mode 100644 index 0000000..88e9f45 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/gather_vx.c @@ -0,0 +1,82 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_TENSOR_GATHER_API_SUPPORT) + +#define REGISTER_GATHEROPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_GATHEROPENVX_KERNEL( gather ) +{ + vx_node node = NULL; + int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis"); + int32_t batch_dims = vsi_nn_kernel_param_get_int32(params, "batch_dims"); + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + node = vxTensorGatherNode(graph->g, + inputs[0]->t, + inputs[1]->t, + axis, + batch_dims, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* gather() */ + +#undef REGISTER_GATHEROPENVX_KERNEL + +#endif diff --git a/src/tim/vx/internal/src/kernel/vx/pow_vx.c b/src/tim/vx/internal/src/kernel/vx/pow_vx.c new file mode 100644 index 0000000..eb1bb71 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/pow_vx.c @@ -0,0 +1,73 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_TENSOR_POW_API_SUPPORT) + +#define REGISTER_POWOPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_POWOPENVX_KERNEL( pow ) +{ + vx_node node = vxTensorPowNode( graph->g, inputs[0]->t, inputs[1]->t, + outputs[0]->t ); + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + + return (vsi_nn_kernel_node_t)node; +} /* pow() */ + +#undef REGISTER_POWOPENVX_KERNEL + +#endif diff --git a/src/tim/vx/internal/src/kernel/vx/relationalops_vx.c b/src/tim/vx/internal/src/kernel/vx/relationalops_vx.c new file mode 100644 index 0000000..0d93b45 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/relationalops_vx.c @@ -0,0 +1,83 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT) + +#define REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( relational_ops ) +{ + vx_node node = NULL; + + int32_t operation = vsi_nn_kernel_param_get_int32(params, "operation"); + vx_tensor inputs_tensor[2] = {NULL}; + + inputs_tensor[0] = inputs[0]->t; + inputs_tensor[1] = inputs[1]->t; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + + node = vxRelationalLayer(graph->g, + operation, + inputs_tensor, + (uint32_t)input_num, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* relational_ops() */ + +#undef REGISTER_RELATIONAL_OPS_OPENVX_KERNEL + +#endif diff --git a/src/tim/vx/internal/src/kernel/vx/tile_vx.c b/src/tim/vx/internal/src/kernel/vx/tile_vx.c new file mode 100644 index 0000000..fd65ee1 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/tile_vx.c @@ -0,0 +1,78 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_TENSOR_TILE_API_SUPPORT) + +#define REGISTER_TILEOPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_TILEOPENVX_KERNEL( tile ) +{ + vx_node node = NULL; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + node = vxTensorTileNode(graph->g, + inputs[0]->t, + inputs[1]->t, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* tile() */ + +#undef REGISTER_TILEOPENVX_KERNEL + +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl index de65186..bf9fd64 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl @@ -88,6 +88,8 @@ __kernel void cumsum_##name##toU8_axis2( \ \ src_type sum = (src_type)(0); \ uint4 dst = (uint4)(0); \ + int tmp_zp = convert_int_rte(output_zp); \ + dst.x = convert_uint_sat(tmp_zp); \ \ float cnt = 0.0f; \ \ @@ -252,6 +254,8 @@ __kernel void cumsum_##name##toU8_axis1( \ \ src_type sum = (src_type)(0); \ uint4 dst = (uint4)(0); \ + int tmp_zp = convert_int_rte(output_zp); \ + dst.x = convert_uint_sat(tmp_zp); \ \ float cnt = 0; \ \ @@ -416,6 +420,8 @@ __kernel void cumsum_##name##toU8_axis0( \ \ src_type sum = (src_type)(0); \ uint4 dst = (uint4)(0); \ + int tmp_zp = convert_int_rte(output_zp); \ + dst.x = convert_uint_sat(tmp_zp); \ \ float cnt = 0; \ \ @@ -487,4 +493,4 @@ __kernel void cumsum_##name##toU8_axis0( \ } \ } CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui) -CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef) \ No newline at end of file +CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl index 5fec847..3a90480 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl @@ -85,12 +85,15 @@ __kernel void cumsum_U8toU8_axis1_2D( uint4 sum = (uint4)(0); uint4 dst = (uint4)(0); + int tmp_zp = convert_int_rte(output_zp); + dst.x = convert_uint_sat(tmp_zp); + float cnt = 0; if(exclusive && rev) { coord.w = height - 1; - write_imageui(output, coord.zw, sum); + write_imageui(output, coord.zw, dst); for(coord.y = height - 1; coord.y > 0; coord.y--) { uint4 data = read_imageui(input, coord.xy); @@ -107,7 +110,7 @@ __kernel void cumsum_U8toU8_axis1_2D( } else if(exclusive) { - write_imageui(output, coord.zw, sum); + write_imageui(output, coord.zw, dst); for(coord.y = 0; coord.y < height - 1; coord.y++) { uint4 data = read_imageui(input, coord.xy); @@ -173,6 +176,8 @@ __kernel void cumsum_F32toU8_axis1_2D( float4 sum = 
(float4)(0); uint4 dst = (uint4)(0); + int tmp_zp = convert_int_rte(output_zp); + dst.x = convert_uint_sat(tmp_zp); float cnt = 0; @@ -331,13 +336,16 @@ __kernel void cumsum_U8toU8_axis0_2D( uint4 sum = (uint4)(0); uint4 dst = (uint4)(0); + int tmp_zp = convert_int_rte(output_zp); + dst.x = convert_uint_sat(tmp_zp); + float cnt = 0.0f; if(exclusive && rev) { coord.x = width - 1; coord.z = coord.x; - write_imageui(output, coord.zw, sum); + write_imageui(output, coord.zw, dst); for(; coord.x > 0; coord.x--) { uint4 data = read_imageui(input, coord.xy); @@ -355,7 +363,7 @@ __kernel void cumsum_U8toU8_axis0_2D( else if(exclusive) { coord.z = 0; - write_imageui(output, coord.zw, sum); + write_imageui(output, coord.zw, dst); for(coord.x = 0; coord.x < width - 1; coord.x++) { uint4 data = read_imageui(input, coord.xy); @@ -421,9 +429,10 @@ __kernel void cumsum_F32toU8_axis0_2D( float4 sum = (float4)(0); uint4 dst = (uint4)(0); + int tmp_zp = convert_int_rte(output_zp); + dst.x = convert_uint_sat(tmp_zp); float cnt = 0.0f; - if(exclusive && rev) { coord.x = width - 1; @@ -491,4 +500,4 @@ __kernel void cumsum_F32toU8_axis0_2D( write_imageui(output, coord.xy, dst); } } -} \ No newline at end of file +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis0.cl index 0bb51ba..d8d9972 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis0.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis0.cl @@ -1,3 +1,6 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + #define rlogE (0.693147182f) float LOG(float x) { @@ -5,16 +8,11 @@ float LOG(float x) return x * rlogE; } -__kernel void log_softmax_axis0_F32toF32 - ( +__kernel void log_softmax_axis0_F32toF32( __read_only image2d_array_t input, __write_only image2d_array_t output, - int axis, - float beta, - float scale, - float scaleOut, - float zpOut - ) + int axis, float beta, + float scale, float scaleOut, float zpOut) { int x = get_global_id(0); int y = get_global_id(1); @@ -58,16 +56,11 @@ __kernel void log_softmax_axis0_F32toF32 } } -__kernel void log_softmax_axis0_F32toF32_2D - ( +__kernel void log_softmax_axis0_F32toF32_2D( __read_only image2d_t input, __write_only image2d_t output, - int axis, - float beta, - float scale, - float scaleOut, - float zpOut - ) + int axis, float beta, + float scale, float scaleOut, float zpOut) { int x = get_global_id(0); int y = get_global_id(1); @@ -110,16 +103,11 @@ __kernel void log_softmax_axis0_F32toF32_2D } } -__kernel void log_softmax_axis0_U8toU8 - ( +__kernel void log_softmax_axis0_U8toU8( __read_only image2d_array_t input, __write_only image2d_array_t output, - int axis, - float beta, - float scale, - float scaleOut, - float zpOut - ) + int axis, float beta, + float scale, float scaleOut, float zpOut) { int x = get_global_id(0); int y = get_global_id(1); @@ -165,16 +153,11 @@ __kernel void log_softmax_axis0_U8toU8 } } -__kernel void log_softmax_axis0_U8toU8_2D - ( +__kernel void log_softmax_axis0_U8toU8_2D( __read_only image2d_t input, __write_only image2d_t output, - int axis, - float beta, - float scale, - float scaleOut, - float zpOut - ) + int axis, float beta, + float scale, float scaleOut, float zpOut) { int x = get_global_id(0); int y = get_global_id(1); @@ -217,4 +200,109 @@ __kernel void log_softmax_axis0_U8toU8_2D coord_in.x++; } } + +__kernel void log_softmax_axis0_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int 
axis, float beta, + float scale, float scaleOut, float zpOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int width = get_image_width(input); + int4 coord_in = (int4)(0, y, z, 0); + float4 maxValue, src, dst = {0.0}; + uint4 data, val, out; + + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, maxValue, data, 16); + for (coord_in.x = 1; coord_in.x < width; ) + { + data = read_imageui(input, coord_in); + coord_in.x++; + data = data << 16; + _viv_asm(COPY, src, data, 16); + + maxValue = maxValue > src ? maxValue : src; + } + + float sum = 0.f; + for (coord_in.x = 0; coord_in.x < width; ) + { + data = read_imageui(input, coord_in); + coord_in.x++; + data = data << 16; + _viv_asm(COPY, src, data, 16); + + sum += exp2((src.x - maxValue.x) * scale); + } + + float logSum = LOG(sum); + for (coord_in.x = 0; coord_in.x < width; ) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + dst.x = (src.x - maxValue.x) * beta - logSum; + _viv_asm(COPY, val, dst, 16); + out = val >> 16; + write_imageui(output, coord_in, out); + coord_in.x++; + } +} + +__kernel void log_softmax_axis0_BF16toBF16_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, float beta, + float scale, float scaleOut, float zpOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int width = get_image_width(input); + int2 coord_in = (int2)(0, y); + float4 maxValue, src, dst = {0.0}; + uint4 data, val, out; + + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, maxValue, data, 16); + for (coord_in.x = 1; coord_in.x < width; ) + { + data = read_imageui(input, coord_in); + coord_in.x++; + data = data << 16; + _viv_asm(COPY, src, data, 16); + + maxValue = maxValue > src ? 
maxValue : src; + } + + float sum = 0.0f; + for (coord_in.x = 0; coord_in.x < width; ) + { + data = read_imageui(input, coord_in); + coord_in.x++; + data = data << 16; + _viv_asm(COPY, src, data, 16); + + sum += exp2((src.x - maxValue.x) * scale); + } + + float logSum = LOG(sum); + for (coord_in.x = 0; coord_in.x < width; ) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + dst.x = (src.x - maxValue.x) * beta - logSum; + _viv_asm(COPY, val, dst, 16); + out = val >> 16; + write_imageui(output, coord_in, out); + coord_in.x++; + } +} #undef rlogE diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis1.cl index e647014..487c401 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis1.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis1.cl @@ -1,3 +1,6 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + #define rlogE (0.693147182f) float LOG(float x) @@ -6,16 +9,11 @@ float LOG(float x) return x * rlogE; } -__kernel void log_softmax_axis1_F32toF32 - ( +__kernel void log_softmax_axis1_F32toF32( __read_only image2d_array_t input, __write_only image2d_array_t output, - int axis, - float beta, - float scale, - float scaleOut, - float zpOut - ) + int axis, float beta, + float scale, float scaleOut, float zpOut) { int x = get_global_id(0); int y = get_global_id(1); @@ -59,16 +57,11 @@ __kernel void log_softmax_axis1_F32toF32 } } -__kernel void log_softmax_axis1_F32toF32_2D - ( +__kernel void log_softmax_axis1_F32toF32_2D( __read_only image2d_t input, __write_only image2d_t output, - int axis, - float beta, - float scale, - float scaleOut, - float zpOut - ) + int axis, float beta, + float scale, float scaleOut, float zpOut) { int x = get_global_id(0); int y = get_global_id(1); @@ -111,16 +104,11 @@ __kernel void log_softmax_axis1_F32toF32_2D } } -__kernel void log_softmax_axis1_U8toU8 - ( +__kernel void log_softmax_axis1_U8toU8( __read_only image2d_array_t input, __write_only image2d_array_t output, - int axis, - float beta, - float scale, - float scaleOut, - float zpOut - ) + int axis, float beta, + float scale, float scaleOut, float zpOut) { int x = get_global_id(0); int y = get_global_id(1); @@ -166,16 +154,11 @@ __kernel void log_softmax_axis1_U8toU8 } } -__kernel void log_softmax_axis1_U8toU8_2D - ( +__kernel void log_softmax_axis1_U8toU8_2D( __read_only image2d_t input, __write_only image2d_t output, - int axis, - float beta, - float scale, - float scaleOut, - float zpOut - ) + int axis, float beta, + float scale, float scaleOut, float zpOut) { int x = get_global_id(0); int y = get_global_id(1); @@ -218,4 +201,111 @@ __kernel void log_softmax_axis1_U8toU8_2D coord_in.y++; } } + +__kernel void log_softmax_axis1_BF16oBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, float beta, + float scale, float scaleOut, float zpOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int height = get_image_height(input); + int4 coord_in = (int4)(x, 0, z, 0); + float4 maxValue, src, dst = {0.0}; + uint4 data, val, out; + + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, maxValue, data, 16); + for (coord_in.y = 1; coord_in.y < height; ) + { + data = read_imageui(input, coord_in); + coord_in.y++; + data = data << 16; + _viv_asm(COPY, src, data, 16); + + maxValue = maxValue > src ? 
maxValue : src; + } + + float sum = 0.f; + for (coord_in.y = 0; coord_in.y < height; ) + { + data = read_imageui(input, coord_in); + coord_in.y++; + data = data << 16; + _viv_asm(COPY, src, data, 16); + + sum += exp2((src.x - maxValue.x) * scale); + } + + float logSum = LOG(sum); + for (coord_in.y = 0; coord_in.y < height; ) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + dst.x = (src.x - maxValue.x) * beta - logSum; + + _viv_asm(COPY, val, dst, 16); + out = val >> 16; + + write_imageui(output, coord_in, out); + coord_in.y++; + } +} + +__kernel void log_softmax_axis1_BF16toBF16_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, float beta, + float scale, float scaleOut, float zpOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int height = get_image_height(input); + int2 coord_in = (int2)(x, 0); + float4 maxValue, src, dst = {0.0}; + uint4 data, val, out; + + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, maxValue, data, 16); + for (coord_in.y = 1; coord_in.y < height; ) + { + data = read_imageui(input, coord_in); + coord_in.y++; + data = data << 16; + _viv_asm(COPY, src, data, 16); + + maxValue = maxValue > src ? maxValue : src; + } + + float sum = 0.0f; + for (coord_in.y = 0; coord_in.y < height; ) + { + data = read_imageui(input, coord_in); + coord_in.y++; + data = data << 16; + _viv_asm(COPY, src, data, 16); + + sum += exp2((src.x - maxValue.x) * scale); + } + + float logSum = 1.0f * LOG(sum); + for (coord_in.y = 0; coord_in.y < height; ) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + dst.x = (src.x - maxValue.x) * beta - logSum; + _viv_asm(COPY, val, dst, 16); + out = val >> 16; + write_imageui(output, coord_in, out); + coord_in.y++; + } +} #undef rlogE diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis2.cl index d45ff39..a909f4d 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis2.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_axis2.cl @@ -1,3 +1,6 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + #define rlogE (0.693147182f) float LOG(float x) { @@ -112,4 +115,68 @@ __kernel void log_softmax_axis2_U8toU8 coord_in.z++; } } + +__kernel void log_softmax_axis2_BF16toBF16 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + float beta, + float scale, + float scaleOut, + float zpOut + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + int depth = get_image_array_size(input); + int4 coord_in = (int4)(x, y, 0, 0); + float4 maxValue; + float4 src, dst = {0.0}; + uint4 data, val, out; + + // Find max element value which we'll use to ensure numerical stability + // taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, maxValue, data, 16); + for (coord_in.z = 1; coord_in.z < depth; ) + { + data = read_imageui(input, coord_in); + coord_in.z++; + data = data << 16; + _viv_asm(COPY, src, data, 16); + + maxValue = maxValue > src ? maxValue : src; + } + + // Compute sum. 
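+    /* exp2 with a pre-scaled exponent: scale is assumed to be beta * log2(e) on the host side, so LOG(sum) below is the natural-log softmax denominator. */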
+ float sum = 0.f; + for (coord_in.z = 0; coord_in.z < depth; ) + { + data = read_imageui(input, coord_in); + coord_in.z++; + data = data << 16; + _viv_asm(COPY, src, data, 16); + + sum += exp2((src.x - maxValue.x) * scale); + } + + // Compute result. + float logSum = LOG(sum); + for (coord_in.z = 0; coord_in.z < depth; ) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + dst.x = (src.x - maxValue.x) * beta - logSum; + _viv_asm(COPY, val, dst, 16); + out = val >> 16; + write_imageui(output, coord_in, out); + coord_in.z++; + } +} #undef rlogE diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_4x.cl b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_4x.cl new file mode 100644 index 0000000..e4cc547 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_4x.cl @@ -0,0 +1,127 @@ +#pragma OPENCL EXTENSION CL_VIV_asm : enable + +__kernel void gemm_4x_F32F32toF32_2D( + __read_only image2d_t inputA, + __read_only image2d_t inputB, + __write_only image2d_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out + ) +{ + int offset0 = get_global_id(0) * K; + int offset1 = offset0 + K; + int offset2 = offset1 + K; + int offset3 = offset2 + K; + int out_offset = get_global_id(0); + int z = 0; + float4 sum = (float4)(0, 0, 0, 0); + + Image in0_tensor = create_image_from_image2d(inputA, 4); + __global float* in0_ptr = (__global float*)in0_tensor.ptr; + __global float* in0_ptr0 = in0_ptr + offset0; + __global float* in0_ptr1 = in0_ptr + offset1; + __global float* in0_ptr2 = in0_ptr + offset2; + __global float* in0_ptr3 = in0_ptr + offset3; + + Image in1_tensor = create_image_from_image2d(inputB, 4); + __global float* in1_ptr = (__global float*)in1_tensor.ptr; + + Image o_tensor = create_image_from_image2d(output, 4); + __global float* output_ptr = (__global float*)o_tensor.ptr + out_offset; + + int step = K >> 2; + for(z = 0; z < step; z++) + { + float4 tempA0, tempA1, tempA2, tempA3; + float4 tempB0; + + tempB0 = vload4(z, in1_ptr); + tempA0 = vload4(z, in0_ptr0); + tempA1 = vload4(z, in0_ptr1); + tempA2 = vload4(z, in0_ptr2); + tempA3 = vload4(z, in0_ptr3); + + sum.x += dot(tempA0, tempB0); + sum.y += dot(tempA1, tempB0); + sum.z += dot(tempA2, tempB0); + sum.w += dot(tempA3, tempB0); + } + + vstore4(sum, 0, output_ptr); + +} + +__kernel void gemm_4x_transa_F32F32toF32_2D( + __read_only image2d_t inputA, + __read_only image2d_t inputB, + __write_only image2d_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out + ) +{ + int offset0 = get_global_id(0); + int offset1 = M << 2; + + int z = 0; + float4 sum = (float4)(0, 0, 0, 0); + + Image in0_tensor = create_image_from_image2d(inputA, 4); + __global float* in0_ptr0 = (__global float*)in0_tensor.ptr + offset0; + __global float* in0_ptr1 = in0_ptr0 + M; + __global float* in0_ptr2 = in0_ptr1 + M; + __global float* in0_ptr3 = in0_ptr2 + M; + + Image in1_tensor = create_image_from_image2d(inputB, 4); + __global float* in1_ptr = (__global float*)in1_tensor.ptr; + + Image o_tensor = create_image_from_image2d(output, 4); + __global float* output_ptr = (__global float*)o_tensor.ptr + offset0; + + int step = K >> 2; + for(z = 0; z < step; z++) + { + float4 tempA0, tempA1, tempA2, tempA3; + float4 tempB0; + + tempB0 = vload4(z, in1_ptr); + tempA0 = vload4(0, in0_ptr0); + tempA1 = 
vload4(0, in0_ptr1); + tempA2 = vload4(0, in0_ptr2); + tempA3 = vload4(0, in0_ptr3); + + sum += tempA0 * tempB0.x; + sum += tempA1 * tempB0.y; + sum += tempA2 * tempB0.z; + sum += tempA3 * tempB0.w; + + in0_ptr0 = in0_ptr0 + offset1; + in0_ptr1 = in0_ptr1 + offset1; + in0_ptr2 = in0_ptr2 + offset1; + in0_ptr3 = in0_ptr3 + offset1; + + } + + vstore4(sum, 0, output_ptr); + +} + + + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/maxpool.cl b/src/tim/vx/internal/src/libnnext/ops/cl/maxpool.cl new file mode 100644 index 0000000..f87e9e4 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/maxpool.cl @@ -0,0 +1,217 @@ +#define VSI_FLOAT32_MIN (1.175494351e-38F) + +#define MAXPOOL_QINT(in_name, out_name, src_type, dst_type, max_val, read_func, write_func, conv_func) \ +__kernel void maxpool_##in_name##to##out_name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int width, \ + int height, \ + int stride_x, \ + int stride_y, \ + int pad_x, \ + int pad_y, \ + int kernel_dia_x, \ + int kernel_dia_y, \ + int dilation_x, \ + int dilation_y, \ + float inout_scale, \ + float inout_tail) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord_out = (int4)(gidx, gidy, gidz, 0); \ + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); \ + int4 coord_in = coord_out; \ + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); \ + \ + for(; pos_start.x < 0;) \ + { \ + pos_start.x += dilation_x; \ + } \ + for(; pos_start.y < 0;) \ + { \ + pos_start.y += dilation_y; \ + } \ + \ + pos_end = min(pos_end, (int2)(width, height)); \ + \ + src_type src0, maxVal; \ + maxVal.x = max_val; \ + \ + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) \ + { \ + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) \ + { \ + src0 = read_func(input, coord_in); \ + coord_in.x += dilation_x; \ + maxVal = max(src0, maxVal); \ + } \ + } \ + \ + float4 fValTmp; \ + fValTmp.x = maxVal.x * inout_scale + inout_tail; \ + dst_type dst = conv_func(fValTmp); \ + write_func(output, coord_out, dst.xxxx); \ +} +MAXPOOL_QINT(U32, U32, uint4, uint4, 0, read_imageui, write_imageui, convert_uint4_rte) +MAXPOOL_QINT(I32, I32, int4, int4, -2147483648, read_imagei, write_imagei, convert_int4_rte) + +__kernel void maxpool_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int width, + int height, + int stride_x, + int stride_y, + int pad_x, + int pad_y, + int kernel_dia_x, + int kernel_dia_y, + int dilation_x, + int dilation_y, + float inout_scale, + float inout_tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord_out = (int4)(gidx, gidy, gidz, 0); + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); + int4 coord_in = coord_out; + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); + + for(; pos_start.x < 0;) + { + pos_start.x += dilation_x; + } + for(; pos_start.y < 0;) + { + pos_start.y += dilation_y; + } + + pos_end = min(pos_end, (int2)(width, height)); + + float4 src0, maxVal; + maxVal.x = VSI_FLOAT32_MIN; + + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) + { + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) + { + src0 = read_imagef(input, coord_in); + coord_in.x += dilation_x; + maxVal = max(src0, maxVal); + } + } + + write_imagef(output, coord_out, maxVal.xxxx); +} + +__kernel void 
maxpool_U32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int width, + int height, + int stride_x, + int stride_y, + int pad_x, + int pad_y, + int kernel_dia_x, + int kernel_dia_y, + int dilation_x, + int dilation_y, + float inout_scale, + float inout_tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord_out = (int4)(gidx, gidy, gidz, 0); + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); + int4 coord_in = coord_out; + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); + + for(; pos_start.x < 0;) + { + pos_start.x += dilation_x; + } + for(; pos_start.y < 0;) + { + pos_start.y += dilation_y; + } + + pos_end = min(pos_end, (int2)(width, height)); + + uint4 src0, maxVal; + maxVal.x = 0; + + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) + { + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) + { + src0 = read_imageui(input, coord_in); + coord_in.x += dilation_x; + maxVal = max(src0, maxVal); + } + } + + float4 dst; + dst.x = maxVal.x * inout_scale + inout_tail; + + write_imagef(output, coord_out, dst.xxxx); +} + +__kernel void maxpool_F32toU32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int width, + int height, + int stride_x, + int stride_y, + int pad_x, + int pad_y, + int kernel_dia_x, + int kernel_dia_y, + int dilation_x, + int dilation_y, + float inout_scale, + float inout_tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord_out = (int4)(gidx, gidy, gidz, 0); + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); + int4 coord_in = coord_out; + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); + + for(; pos_start.x < 0;) + { + pos_start.x += dilation_x; + } + for(; pos_start.y < 0;) + { + pos_start.y += dilation_y; + } + + pos_end = min(pos_end, (int2)(width, height)); + + float4 src0, maxVal; + maxVal.x = VSI_FLOAT32_MIN; + + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) + { + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) + { + src0 = read_imagef(input, coord_in); + coord_in.x += dilation_x; + maxVal = max(src0, maxVal); + } + } + + uint4 dst; + dst.x = convert_uint_rte(maxVal.x * inout_scale + inout_tail); + + write_imageui(output, coord_out, dst.xxxx); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl index 62dd4d6..bb450d8 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl @@ -232,3 +232,66 @@ __kernel void moments_axis01_BF16toF32( write_imagef(output_vari, coord_out, vari); } } + +__kernel __attribute__((reqd_work_group_size(8, 8, 1))) void moments_axis12_U8toF32( + image2d_array_t input, image2d_array_t output_mean, image2d_array_t output_vari, + int axis, int axis_num, int input_zp, float input_scale, + int width, int height, int chn, float dimRatio + ) +{ + int lidx = get_local_id(0); + int lidy = get_local_id(1); + int gidz = get_global_id(2); // width + + int4 coord = (int4)(gidz, lidx, lidy, 0); + uint4 data; + float sum = 0, sqr = 0; + float e2InScale = input_scale * input_scale; + + __local uint lcl_sumSqr[128]; + __local uint lcl_sumSqr1[32]; + + uint2 tmpSumSqr = 0; + for(coord.z = lidy; coord.z < chn; coord.z += 8) + { + for(coord.y = lidx; coord.y 
< height;) + { + data = read_imageui(input, coord); + coord.y += 8; + tmpSumSqr = tmpSumSqr + (uint2)(data.x, data.x * data.x); + } + //sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; + //sum += (tmpSum - height * input_zp) * input_scale; + } + int index = lidx + lidy * 8; + vstore2(tmpSumSqr, index, lcl_sumSqr); + barrier(CLK_LOCAL_MEM_FENCE); + if(index < 16) + { + uint4 val0 = vload4(index, lcl_sumSqr); + uint4 val1 = vload4(index, lcl_sumSqr + 64); + val0 += val1; + uint2 val2 = val0.xy + val0.zw; + vstore2(val2, index, lcl_sumSqr1); + } + barrier(CLK_LOCAL_MEM_FENCE); + if(index == 0) + { + uint4 val0 = 0; + for(int i = 0; i < 8; i++) + { + val0 += vload4(i, lcl_sumSqr1); + } + + float2 tmpVal = convert_float2(val0.xy + val0.zw); + sum = (tmpVal.x - height * chn * input_zp) * input_scale; + sqr = (tmpVal.y - 2 * input_zp * tmpVal.x + height * chn * input_zp * input_zp) * e2InScale; + float4 mean, vari; + mean.x = sum * dimRatio; + vari.x = sqr * dimRatio; + vari.x = vari.x - mean.x * mean.x; + + write_imagef(output_mean, coord.xwww, mean); + write_imagef(output_vari, coord.xwww, vari); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine.vx index cd9511b..f1edbb0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine.vx @@ -5,157 +5,6 @@ _viv_uniform float4 matrix0; _viv_uniform float2 matrix1; _viv_uniform float4 matrix4; -__kernel void custom_warp_affine_nearest_neighbor_U8toU8_2D -( - __read_only image2d_array_t input, - __write_only image2d_array_t output, - float _m0, - float _m1, - float _m2, - float _m3, - float _m4, - float _m5 -) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); - - float4 coord_f = convert_float4(coord_in); - - coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; - - coord_in = convert_int4(coord_f); - - vxc_uchar16 dst; - VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord_f = coord_f.zwzw + matrix4; - coord_in = convert_int4(coord_f); - VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - coord_f = coord_f.zwzw + matrix4; - coord_in = convert_int4(coord_f); - VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - coord_f = coord_f.zwzw + matrix4; - coord_in = convert_int4(coord_f); - VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - - - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void custom_warp_affine_bilinear_U8toU8_2D -( - __read_only image2d_array_t input, - __write_only image2d_array_t output, - float _m0, - float _m1, - float _m2, - float _m3, - float _m4, - float _m5 -) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); - - float4 
coord_f = convert_float4(coord_in); - - coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; - - coord_in = convert_int4(coord_f); - - vxc_uchar16 src0, src1, dst; - VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); -#if (VX_VERSION==1) - VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); -#else - VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - src1.s0 = src0.s1; - VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); -#endif - - VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); -#if (VX_VERSION==1) - VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); -#else - VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - src1.s0 = src0.s1; - VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); -#endif - - coord_f = coord_f.zwzw + matrix4; - coord_in = convert_int4(coord_f); - VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); -#if (VX_VERSION==1) - VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); -#else - VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - src1.s0 = src0.s1; - VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); -#endif - - VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); -#if (VX_VERSION==1) - VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); -#else - VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - src1.s0 = src0.s1; - VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); -#endif - - coord_f = coord_f.zwzw + matrix4; - coord_in = convert_int4(coord_f); - VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); -#if (VX_VERSION==1) - VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); -#else - VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - src1.s0 = src0.s1; - VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); -#endif - - VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); -#if (VX_VERSION==1) - VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); -#else - VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - src1.s0 = src0.s1; - VXC_Lerp(dst, src0, src1, 
coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); -#endif - - coord_f = coord_f.zwzw + matrix4; - coord_in = convert_int4(coord_f); - VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); -#if (VX_VERSION==1) - VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); -#else - VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - src1.s0 = src0.s1; - VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); -#endif - - VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); -#if (VX_VERSION==1) - VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); -#else - VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - src1.s0 = src0.s1; - VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); -#endif - - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} __kernel void custom_warp_affine_nearest_neighbor_U8toU8 ( diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_2d.vx new file mode 100644 index 0000000..b3ce247 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_2d.vx @@ -0,0 +1,158 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#include "cl_viv_vx_ext.h" + +_viv_uniform float4 matrix0; +_viv_uniform float2 matrix1; +_viv_uniform float4 matrix4; +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_2D +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in = convert_int4(coord_f); + + vxc_uchar16 dst; + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_affine_bilinear_U8toU8_2D +( + __read_only image2d_array_t input, + 
__write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in = convert_int4(coord_f); + + vxc_uchar16 src0, src1, dst; + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), 
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_optional.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_optional.vx new file mode 100644 index 0000000..c3200e7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_optional.vx @@ -0,0 +1,341 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + +#define WARP_AFFINE(name) \ +__kernel void custom_warp_affine_##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t matrix, \ + __write_only image2d_array_t output, \ + float _m0, \ + float _m1, \ + float _m2, \ + float _m3, \ + float _m4, \ + float _m5 \ +) \ + +#define GET_MATRIX_VALUE \ + float4 matrix0; \ + float2 matrix1; \ + float4 matrix4; \ + int2 coord_matrix = (int2)(0,0); \ + Image img1 = create_image_from_image2d(matrix, 4); \ + __global float* matrix_ptr = (__global float*)img1.ptr; \ + matrix0 = vload4(0, matrix_ptr); \ + matrix1 = vload2(2, matrix_ptr); \ + matrix4.x = matrix0.x; \ + matrix4.y = matrix0.y; \ + matrix4.z = matrix0.x * 2; \ + matrix4.w = matrix0.y * 2; \ + +WARP_AFFINE(nearest_neighbor_U8toU8_2D_optional_input) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + float4 coord_f = convert_float4(coord_in); + + GET_MATRIX_VALUE + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in = convert_int4(coord_f); + + vxc_uchar16 dst; + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, 
VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +WARP_AFFINE(bilinear_U8toU8_2D_optional_input) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + GET_MATRIX_VALUE + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in = convert_int4(coord_f); + + vxc_uchar16 src0, src1, dst; + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, 
coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +WARP_AFFINE(nearest_neighbor_U8toU8_optional_input) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + float4 coord_f = convert_float4(coord_in); + + GET_MATRIX_VALUE + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + coord_in = convert_int4(coord_f); + + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_input.w, baseAddr); + + vxc_uchar16 dst; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + 
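+    /* advance the affine-mapped source coordinates by one and two more output columns; matrix4 holds (m0, m1, 2*m0, 2*m1). */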
coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); + + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +WARP_AFFINE(bilinear_U8toU8_optional_input) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + float4 coord_f = convert_float4(coord_in); + + GET_MATRIX_VALUE + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in = convert_int4(coord_f); + + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_input.w, baseAddr); + + vxc_uchar16 src0, src1, dst; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, 
VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_f = coord_f.zwzw + matrix4; + coord_in = convert_int4(coord_f); + coord_input.xy = coord_in.xy; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_rgb_optional.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_rgb_optional.vx new file mode 100644 index 0000000..94f121f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_rgb_optional.vx @@ -0,0 +1,333 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + +#define GET_MATRIX_VALUE \ + float4 matrix0; \ + float2 matrix1; \ + Image img1 = 
create_image_from_image2d(matrix, 4); \ + __global float* matrix_ptr = (__global float*)img1.ptr; \ + matrix0 = vload4(0, matrix_ptr); \ + matrix1 = vload2(2, matrix_ptr); \ + +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb_2D_optional_input +( + __read_only image2d_array_t input, + __read_only image2d_t matrix, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + int2 coord_matrix = (int2)(0,0); + GET_MATRIX_VALUE + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in.x = floor(coord_f.x) * 3; + coord_in.y = floor(coord_f.y); + coord_in.z = floor(coord_f.z) * 3; + coord_in.w = floor(coord_f.w); + + vxc_uchar16 dst; + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = coord_in.x + 1; + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = coord_in.x + 1; + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + coord_in.z = coord_in.z + 1; + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + coord_in.z = coord_in.z + 1; + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_affine_bilinear_U8toU8_rgb_2D_optional_input +( + __read_only image2d_array_t input, + __read_only image2d_t matrix, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + int2 coord_matrix = (int2)(0,0); + GET_MATRIX_VALUE + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in.x = floor(coord_f.x) * 3; + coord_in.y = floor(coord_f.y); + coord_in.z = floor(coord_f.z) * 3; + coord_in.w = floor(coord_f.w); + + vxc_uchar16 src0, src1, src_0, src_1, dst; + VXC_ReadImage(src_0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src_1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + src0.x = src_0.s0; + src0.y = src_0.s3; + src1.x = src_1.s0; + src1.y = src_1.s3; + +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s1; + src0.y = src_0.s4; + src1.x = src_1.s1; + src1.y = src_1.s4; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, 
VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s2; + src0.y = src_0.s5; + src1.x = src_1.s2; + src1.y = src_1.s5; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src_0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src_1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + src0.x = src_0.s0; + src0.y = src_0.s3; + src1.x = src_1.s0; + src1.y = src_1.s3; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s1; + src0.y = src_0.s4; + src1.x = src_1.s1; + src1.y = src_1.s4; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s2; + src0.y = src_0.s5; + src1.x = src_1.s2; + src1.y = src_1.s5; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb_optional_input +( + __read_only image2d_array_t input, + __read_only image2d_t matrix, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + int2 coord_matrix = (int2)(0,0); + GET_MATRIX_VALUE + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in.x = floor(coord_f.x) * 3; + coord_in.y = floor(coord_f.y); + coord_in.z = floor(coord_f.z) * 3; + coord_in.w = floor(coord_f.w); + + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_input.w, baseAddr); + + vxc_uchar16 dst; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_input.x = coord_input.x + 1; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_input.x = coord_input.x + 1; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, 
VXC_RM_TowardZero, 0)); + coord_input.x = coord_input.x + 1; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + coord_input.x = coord_input.x + 1; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_affine_bilinear_U8toU8_rgb_optional_input +( + __read_only image2d_array_t input, + __read_only image2d_t matrix, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + int2 coord_matrix = (int2)(0,0); + GET_MATRIX_VALUE + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in.x = floor(coord_f.x) * 3; + coord_in.y = floor(coord_f.y); + coord_in.z = floor(coord_f.z) * 3; + coord_in.w = floor(coord_f.w); + + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_input.w, baseAddr); + + vxc_uchar16 src0, src1, src_0, src_1, dst; + VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + src0.x = src_0.s0; + src0.y = src_0.s3; + src1.x = src_1.s0; + src1.y = src_1.s3; + +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s1; + src0.y = src_0.s4; + src1.x = src_1.s1; + src1.y = src_1.s4; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s2; + src0.y = src_0.s5; + src1.x = src_1.s2; + src1.y = src_1.s5; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + src0.x = src_0.s0; + src0.y = src_0.s3; + src1.x = src_1.s0; + src1.y = src_1.s3; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, 
VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s1; + src0.y = src_0.s4; + src1.x = src_1.s1; + src1.y = src_1.s4; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s2; + src0.y = src_0.s5; + src1.x = src_1.s2; + src1.y = src_1.s5; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx index 9c21fd1..b2009bf 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx @@ -1,6 +1,8 @@ #include "cl_viv_vx_ext.h" _viv_uniform int indices_num; +_viv_uniform int remainder; +_viv_uniform int width; _viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8; __kernel void gather_I8toI8_array( @@ -131,10 +133,12 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \ int axis_num \ ) \ { \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + if (coord.x >= width) return; \ Image img0 = create_image_from_image2d(input0, 1); \ Image img1 = create_image_from_image2d(input1, 4); \ Image img2 = create_image_from_image2d(output, 1); \ - int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ uchar* index_ptr = get_image_ptr_from_coord(img1, coord.xz); \ __global int* index = (__global int*)index_ptr; \ int4 indices = vload4(0, index); \ @@ -146,10 +150,30 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \ __global data_type* data_ptr = (__global data_type*)input_ptr; \ __global write_type* out_ptr = (__global write_type*)output_ptr; \ indices = indices >= 0 ? 
indices : indices + axis_num; \ - src.s0 = data_ptr[indices.x]; \ - src.s1 = data_ptr[indices.y]; \ - src.s2 = data_ptr[indices.z]; \ - src.s3 = data_ptr[indices.w]; \ + if (coord.x + remainder < width) \ + { \ + src.s0 = data_ptr[indices.x]; \ + src.s1 = data_ptr[indices.y]; \ + src.s2 = data_ptr[indices.z]; \ + src.s3 = data_ptr[indices.w]; \ + } \ + else \ + { \ + __global data_type* out_ptr_remainder = (__global data_type*)output_ptr; \ + switch (remainder) \ + { \ + case 3: \ + out_ptr_remainder[2] = data_ptr[indices.z]; \ + case 2: \ + out_ptr_remainder[1] = data_ptr[indices.y]; \ + case 1: \ + out_ptr_remainder[0] = data_ptr[indices.x]; \ + break; \ + default: \ + break; \ + } \ + return; \ + } \ \ VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniExtraCopyDpKeepinEvis_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation.vx index 7a12afa..1203d15 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation.vx @@ -3,6 +3,9 @@ #define logE (1.44269502f) #define twoLogE (2.88539004f) +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + float4 sigmoid(float4 x) { x *= -logE; @@ -104,3 +107,53 @@ GRUCELL_ACTIVATION_SIGMOID_TANH(F16, F16, F16, U8, hsigmoid, #undef UCHAR8 #undef SHORT8 #undef HALF8 + +#define GRUCELL_ACTIVATION_SIGMOID_TANH_BF16(activater) \ +__kernel void grucell_activation_BF16_BF16_BF16_to_BF16_##activater \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output, \ + __write_only image2d_array_t hstate, \ + int gate_activation, \ + int candidate_activation \ + ) \ +{ \ + vxc_short8 src00, src10, src20, data0, data1; \ + float4 src01, src11, src21; \ + \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + VXC_ReadImage(src00, input0, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src10, input1, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src20, input2, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(data0, src00, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src01, data0, 16); \ + VXC_DP2x8(data1, src10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src11, data1, 16); \ + VXC_DP2x8(data0, src20, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src21, data0, 16); \ + \ + src01 = src01 * tensorScale.xxxx - tensorZP.xxxx; \ + src01 = activater(src01); \ + \ + src11 = src11 * tensorScale.yyyy - tensorZP.yyyy; \ + src11 = tangentH(src11); \ + \ + src21 = src21 * tensorScale.zzzz - tensorZP.zzzz; \ + \ + src11 = src11 - src01 * src11; \ + src11 = src01 * src21 + src11; \ + \ + src11 = src11 * tensorScale.wwww + tensorZP.wwww; \ + _viv_asm(COPY, src00, src11, 16); \ + VXC_DP2x8(data0, src00, src00, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(hstate, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GRUCELL_ACTIVATION_SIGMOID_TANH_BF16(sigmoid) 
+GRUCELL_ACTIVATION_SIGMOID_TANH_BF16(hsigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_sma.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_sma.vx index 660bc23..d9e0cc1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_sma.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_sma.vx @@ -3,6 +3,11 @@ _viv_uniform VXC_512Bits uniA_Minus_B_2x8; _viv_uniform VXC_512Bits uniA_Times_B_2x8; _viv_uniform VXC_512Bits uniA_Plus_B_2x8; + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + __kernel void grucell_activation_sma_F16_F16_F16toF16 ( __read_only image2d_array_t input0, @@ -61,3 +66,101 @@ __kernel void grucell_activation_sma_F16_F16_F16toF16_2D VXC_WriteImage(h_status, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } +__kernel void grucell_activation_sma_BF16_BF16_BF16toBF16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __read_only image2d_array_t input2, + __write_only image2d_array_t output, + __write_only image2d_array_t h_status + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + float4 src0, src00, src1, src11, src2, src22, minus, minus1, dst, dst1; + vxc_ushort8 vec0, vec1, vec2, data0, data1; + + VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(vec2, input2, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_DP2x8(data0, vec0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(data1, vec0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src0, data0, 16); + _viv_asm(COPY, src00, data1, 16); + VXC_DP2x8(data0, vec1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(data1, vec1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src1, data0, 16); + _viv_asm(COPY, src11, data1, 16); + VXC_DP2x8(data0, vec2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(data1, vec2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src2, data0, 16); + _viv_asm(COPY, src22, data1, 16); + + minus = src0 - src1; + minus1 = src00 - src11; + + dst = minus * src2 + src1; + dst1 = minus1 * src22 + src11; + _viv_asm(COPY, vec0, dst, 16); + _viv_asm(COPY, vec1, dst1, 16); + VXC_DP2x8(data0, vec0, vec1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + + VXC_WriteImage2DArray(output, coord, data0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(h_status, coord, data0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void grucell_activation_sma_BF16_BF16_BF16toBF16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __read_only image2d_array_t input2, + __write_only image2d_array_t output, + __write_only image2d_array_t h_status + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + float4 src0, src00, src1, src11, src2, src22, minus, minus1, dst, dst1; + vxc_ushort8 vec0, vec1, vec2, data0, data1; + + VXC_ReadImage(vec0, input0, coord, 0, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(vec2, input2, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_DP2x8(data0, vec0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(data1, vec0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src0, data0, 16); + _viv_asm(COPY, src00, data1, 16); + VXC_DP2x8(data0, vec1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(data1, vec1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src1, data0, 16); + _viv_asm(COPY, src11, data1, 16); + VXC_DP2x8(data0, vec2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(data1, vec2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src2, data0, 16); + _viv_asm(COPY, src22, data1, 16); + + minus = src0 - src1; + minus1 = src00 - src11; + + dst = minus * src2 + src1; + dst1 = minus1 * src22 + src11; + _viv_asm(COPY, vec0, dst, 16); + _viv_asm(COPY, vec1, dst1, 16); + VXC_DP2x8(data0, vec0, vec1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, coord, data0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(h_status, coord, data0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx index fce0623..6d117ed 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx @@ -3,6 +3,9 @@ #define logE (1.44269502f) #define twoLogE (logE * 2.0f) +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + float4 sigmoid_func(float4 x) { x *= -logE; @@ -128,3 +131,52 @@ GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8) GRUCELL_QNT_F16TO_QNT(U8, U8, HSIGMOID, hard_sigmoid, vxc_uchar8, vxc_uchar8) GRUCELL_QNT_F16TO_QNT(I8, I8, HSIGMOID, hard_sigmoid, vxc_char8, vxc_char8) GRUCELL_QNT_F16TO_QNT(I16, I16, HSIGMOID, hard_sigmoid, vxc_short8, vxc_short8) + +#define GRUCELL_BF16(act_name, act_func) \ +__kernel void grucell_activation_z_h_BF16_BF16toBF16_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out \ + ) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + vxc_short8 v0, v1, v2, v3, v4, v5, v6, data0, data1; \ + float4 src0, src1, src2, src3, src4, src5, src6; \ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 
0, 0, 0, 0); \ + VXC_DP2x8(data0, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src2, data0, 16); \ + VXC_DP2x8(data1, v4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src4, data1, 16); \ + VXC_DP2x8(data0, v5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src5, data0, 16); \ + VXC_DP2x8(data1, v6, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src6, data1, 16); \ + VXC_DP2x8(data0, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src3, data0, 16); \ + \ + float4 h = src2 + src4; \ + float4 z = src5 + src6; \ + h = tanh_func(h); \ + z = act_func(z); \ + float4 result = (1 - z) * h + z * src3; \ + _viv_asm(COPY, v0, result, 16); \ + VXC_DP2x8(data0, v0, v0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(hstate_out, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GRUCELL_BF16(SIGMOID, sigmoid_func) +GRUCELL_BF16(HSIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_cdnn_activation_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_cdnn_activation_bf16.vx new file mode 100644 index 0000000..5ed938e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_cdnn_activation_bf16.vx @@ -0,0 +1,344 @@ +#include "cl_viv_vx_ext.h" + +#define logE (1.44269502f) +#define twoLogE (2.88539004f) + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + +__kernel void grucell_activation_cdnn_sep_BF16_BF16_BF16_to_BF16_NC( + __read_only image2d_array_t prev_state, + __read_only image2d_array_t input_r, + __read_only image2d_array_t input_z, + __read_only image2d_array_t input_c, + __read_only image2d_array_t recur_r, + __read_only image2d_array_t recur_z, + __read_only image2d_array_t recur_c, + __read_only image2d_t bias_r, + __read_only image2d_t bias_z, + __read_only image2d_t bias_c, + __read_only image2d_t cond_r, + __read_only image2d_t cond_z, + __read_only image2d_t cond_c, + __write_only image2d_array_t output, + __write_only image2d_array_t hstate, + int gate_activation, int candidate_activation, int batch_first) +{ + vxc_ushort8 s0, s1, s2, s3, s4, s5, s7, data0, data1; + float4 r0, r1, z0, z1, c0, c1, state; + float4 r, r2, r3, z, z2, z3, c, c2, c3; + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s7, prev_state, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 
0)); + r2 = read_imagef(bias_r, coord); + r3 = read_imagef(cond_r, coord); + z2 = read_imagef(bias_z, coord); + z3 = read_imagef(cond_z, coord); + c2 = read_imagef(bias_c, coord); + c3 = read_imagef(cond_c, coord); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_DP2x8(data0, s0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, r0, data0, 16); + VXC_DP2x8(data1, s1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, r1, data1, 16); + VXC_DP2x8(data0, s2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, z0, data0, 16); + VXC_DP2x8(data1, s3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, z1, data1, 16); + VXC_DP2x8(data0, s4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, c0, data0, 16); + VXC_DP2x8(data1, s5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, c1, data1, 16); + VXC_DP2x8(data0, s7, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, state, data0, 16); + r = r0 + r1 + r2 + r3; + z = z0 + z1 + z2 + z3; + + r = sigmoid(r); + z = sigmoid(z); + + c = c2 * r + c3; + c = c0 + c1 * r + c; + c = tangentH(c); + + state = z * (state - c) + c; + _viv_asm(COPY, s0, state, 16); + VXC_DP2x8(data0, s0, s0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + + VXC_WriteImage(output, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void grucell_activation_cdnn_sep_BF16_BF16_BF16_to_BF16_CN( + __read_only image2d_array_t prev_state, + __read_only image2d_array_t input_r, + __read_only image2d_array_t input_z, + __read_only image2d_array_t input_c, + __read_only image2d_array_t recur_r, + __read_only image2d_array_t recur_z, + __read_only image2d_array_t recur_c, + __read_only image2d_t bias_r, + __read_only image2d_t bias_z, + __read_only image2d_t bias_c, + __read_only image2d_t cond_r, + __read_only image2d_t cond_z, + __read_only image2d_t cond_c, + __write_only image2d_array_t output, + __write_only image2d_array_t hstate, + int gate_activation, int candidate_activation, int batch_first) +{ + vxc_ushort8 s0, s1, s2, s3, s4, s5, s7, data0, data1; + float4 r0, r1, z0, z1, c0, c1, state; + float4 r, r2, r3, z, z2, z3, c, c2, c3; + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(3, 3, 
0, VXC_RM_TowardZero, 0)); + r2 = read_imagef(bias_r, coord.yx); + r3 = read_imagef(cond_r, coord.yx); + z2 = read_imagef(bias_z, coord.yx); + z3 = read_imagef(cond_z, coord.yx); + c2 = read_imagef(bias_c, coord.yx); + c3 = read_imagef(cond_c, coord.yx); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_DP2x8(data0, s0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, r0, data0, 16); + VXC_DP2x8(data1, s1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, r1, data1, 16); + VXC_DP2x8(data0, s2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, z0, data0, 16); + VXC_DP2x8(data1, s3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, z1, data1, 16); + VXC_DP2x8(data0, s4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, c0, data0, 16); + VXC_DP2x8(data1, s5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, c1, data1, 16); + VXC_DP2x8(data0, s7, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, state, data0, 16); + + r = r0 + r1 + r2.xxxx + r3.xxxx; + z = z0 + z1 + z2.xxxx + z3.xxxx; + + r = sigmoid(r); + z = sigmoid(z); + + c = c2.xxxx * r + c3.xxxx; + c = c0 + c1 * r + c; + c = tangentH(c); + state = z * (state - c) + c; + + _viv_asm(COPY, s0, state, 16); + VXC_DP2x8(data0, s0, s0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + + VXC_WriteImage(output, coord.yx, data0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord.yx, data0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord.x ++; + VXC_WriteImage(output, coord.yx, data0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord.yx, data0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord.x ++; + VXC_WriteImage(output, coord.yx, data0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord.yx, data0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord.x ++; + VXC_WriteImage(output, coord.yx, data0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord.yx, data0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void grucell_activation_cdnn_sep_BF16_BF16_BF16_to_BF16_CN_FULL( + __read_only image2d_array_t prev_state, + __read_only image2d_array_t input_r, + __read_only image2d_array_t input_z, + __read_only image2d_array_t input_c, + __read_only image2d_array_t recur_r, + __read_only image2d_array_t recur_z, + __read_only image2d_array_t recur_c, + __read_only image2d_t bias_r, + __read_only image2d_t bias_z, + __read_only image2d_t bias_c, + __read_only image2d_t cond_r, + __read_only image2d_t cond_z, + __read_only image2d_t cond_c, + __write_only image2d_array_t output, + __write_only image2d_array_t hstate, + int gate_activation, int candidate_activation, int batch_first) +{ + vxc_ushort8 s0, s1, s2, s3, s4, s5, s7, data0, data1; + float4 r0, r1, z0, z1, c0, c1, state; + float4 r, r2, r3, z, z2, z3, c, c2, c3; + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s3, recur_z, 
coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s7, prev_state, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + r2 = read_imagef(bias_r, coord.yx); + r3 = read_imagef(cond_r, coord.yx); + z2 = read_imagef(bias_z, coord.yx); + z3 = read_imagef(cond_z, coord.yx); + c2 = read_imagef(bias_c, coord.yx); + c3 = read_imagef(cond_c, coord.yx); + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_DP2x8(data0, s0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, r0, data0, 16); + VXC_DP2x8(data1, s1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, r1, data1, 16); + VXC_DP2x8(data0, s2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, z0, data0, 16); + VXC_DP2x8(data1, s3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, z1, data1, 16); + VXC_DP2x8(data0, s4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, c0, data0, 16); + VXC_DP2x8(data1, s5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, c1, data1, 16); + VXC_DP2x8(data0, s7, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, state, data0, 16); + + r = r0 + r1 + r2.xxxx + r3.xxxx; + z = z0 + z1 + z2.xxxx + z3.xxxx; + + r = sigmoid(r); + z = sigmoid(z); + + c = c2.xxxx * r + c3.xxxx; + c = c0 + c1 * r + c; + c = tangentH(c); + state = z * (state - c) + c; + + _viv_asm(COPY, s0, state, 16); + VXC_DP2x8(data0, s0, s0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + + +__kernel void grucell_activation_cdnn_BF16_BF16_BF16_to_BF16( + __read_only image2d_array_t prev_state, + __read_only image2d_array_t input_rzc, + __read_only image2d_array_t recur_rzc, + __read_only image2d_t bias_r, + __read_only image2d_t bias_z, + __read_only image2d_t bias_c, + __read_only image2d_t cond_r, + __read_only image2d_t cond_z, + __read_only image2d_t cond_c, + __write_only image2d_array_t output, + __write_only image2d_array_t hstate, + int gate_activation, int candidate_activation, int batch_first) +{ + vxc_ushort8 s0, s1, s2, s3, s4, s5, s7, data0, data1; + float4 r0, r1, z0, z1, c0, c1, state; + float4 r, r2, r3, z, z2, z3, c, c2, c3; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1) * 3, get_global_id(1)); + + VXC_ReadImage(s0, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s1, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s2, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s3, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s4, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(s5, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); 
+ VXC_ReadImage(s7, prev_state, coord.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + r2 = read_imagef(bias_r, coord.xy); + r3 = read_imagef(cond_r, coord.xy); + z2 = read_imagef(bias_z, coord.xy); + z3 = read_imagef(cond_z, coord.xy); + c2 = read_imagef(bias_c, coord.xy); + c3 = read_imagef(cond_c, coord.xy); + + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_DP2x8(data0, s0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, r0, data0, 16); + VXC_DP2x8(data1, s1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, r1, data1, 16); + VXC_DP2x8(data0, s2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, z0, data0, 16); + VXC_DP2x8(data1, s3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, z1, data1, 16); + VXC_DP2x8(data0, s4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, c0, data0, 16); + VXC_DP2x8(data1, s5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, c1, data1, 16); + VXC_DP2x8(data0, s7, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, state, data0, 16); + + r = r0 + r1 + r2 + r3; + z = z0 + z1 + z2 + z3; + + r = sigmoid(r); + z = sigmoid(z); + + c = c2 * r + c3; + c = c0 + c1 * r + c; + c = tangentH(c); + state = z * (state - c) + c; + + _viv_asm(COPY, s0, state, 16); + VXC_DP2x8(data0, s0, s0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, coord.xy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(hstate, coord.xy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx index 1a037de..198ced0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx @@ -3,6 +3,9 @@ #define logE (1.44269502f) #define twoLogE (logE * 2.0f) +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + float4 sigmoid_func(float4 x) { x *= -logE; @@ -98,3 +101,39 @@ GRUCELL_QNT_F16TO_F16(I16, SIGMOID, sigmoid_func, vxc_short8) GRUCELL_QNT_F16TO_F16(U8, HSIGMOID, hard_sigmoid, vxc_uchar8) GRUCELL_QNT_F16TO_F16(I8, HSIGMOID, hard_sigmoid, vxc_char8) GRUCELL_QNT_F16TO_F16(I16, HSIGMOID, hard_sigmoid, vxc_short8) + +#define GRUCELL_BF16(act_name, act_func) \ +__kernel void grucell_h_times_activation_r_BF16_BF16toBF16_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t hstate_r_conv, \ + __write_only image2d_t output \ + ) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + vxc_short8 v0, v1, v2, v3, data0, data1; \ + float4 src0, src1, src2, src3; \ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(data0, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + 
uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src0, data0, 16); \ + VXC_DP2x8(data1, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src1, data1, 16); \ + VXC_DP2x8(data0, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src3, data0, 16); \ + \ + float4 r; \ + r = src0 + src1; \ + r = act_func(r); \ + float4 result = r * src3; \ + _viv_asm(COPY, v0, result, 16); \ + VXC_DP2x8(data0, v0, v0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GRUCELL_BF16(SIGMOID, sigmoid_func) +GRUCELL_BF16(HSIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx index 8086f28..dbd265a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx @@ -3,6 +3,9 @@ #define logE (1.44269502f) #define twoLogE (logE * 2.0f) +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + float4 sigmoid_func(float4 x) { x *= -logE; @@ -150,3 +153,65 @@ GRUCELL_QNT_F16TO_QNT(I16_F16toI16_TANH_SIGMOID, tanh_func, sigmoid_func, GRUCELL_QNT_F16TO_QNT(U8_F16toU8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_uchar8, vxc_uchar8) GRUCELL_QNT_F16TO_QNT(I8_F16toI8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_char8, vxc_char8) GRUCELL_QNT_F16TO_QNT(I16_F16toI16_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_short8, vxc_short8) + +#define GRUCELL_BF16(act_name, act_func, rec_act_name, rec_act_func) \ +__kernel void grucell_reset_after_activation_BF16_BF16toBF16_##act_name##_##rec_act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_r_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out \ + ) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + vxc_short8 v0, v1, v2, v3, v4, v5, v6, data0, data1; \ + float4 src0, src1, src2, src3, src4, src5, src6; \ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(data0, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src0, data0, 16); \ + VXC_DP2x8(data1, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src1, data1, 16); \ + VXC_DP2x8(data0, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + 
uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src2, data0, 16); \ + VXC_DP2x8(data1, v4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src4, data1, 16); \ + VXC_DP2x8(data0, v5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src5, data0, 16); \ + VXC_DP2x8(data1, v6, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src6, data1, 16); \ + VXC_DP2x8(data0, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src3, data0, 16); \ + \ + float4 r; \ + r = src0 + src1; \ + r = rec_act_func(r); \ + float4 h = src4 + r * src2; \ + float4 z = src5 + src6; \ + h = act_func(h); \ + z = rec_act_func(z); \ + float4 result = (1 - z) * h + z * src3; \ + _viv_asm(COPY, v0, result, 16); \ + VXC_DP2x8(data0, v0, v0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(hstate_out, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GRUCELL_BF16(TANH, tanh_func, SIGMOID, sigmoid_func) +GRUCELL_BF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_BP_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_BP_BF16.vx new file mode 100644 index 0000000..0a75533 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_BP_BF16.vx @@ -0,0 +1,124 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; + +#define LSTMUNIT_BP_BF16(act_name, act_func) \ +__kernel void lstmunit_activation_BP_BF16toBF16_BF16_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \ + float4 src0, src1, src2, src3, src4; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + 
VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src0, data0, 16); \ + VXC_DP2x8(data1, vect10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src10, data1, 16); \ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src1, data0, 16); \ + VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src11, data1, 16); \ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src2, data0, 16); \ + VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src12, data1, 16); \ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src3, data0, 16); \ + VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src13, data1, 16); \ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_c_t, data0, 16); \ + data_i_t = src0 + src10 + b0; \ + data_f_t = src1 + src11 + b1; \ + data_g_t = src2 + src12 + b2; \ + data_o_t = src3 + src13 + b3; \ + \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + _viv_asm(COPY, vect0, data_c_t, 16); \ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(COPY, vect1, data_o_t, 16); \ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_BP_BF16(SIGMOID, sigmoid) +LSTMUNIT_BP_BF16(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_B_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_B_BF16.vx new file mode 100644 index 0000000..573c071 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_B_BF16.vx @@ -0,0 +1,126 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; + +#define LSTMUNIT_B_BF16(act_name, act_func) \ +__kernel void lstmunit_activation_B_BF16toBF16_BF16_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \ + float4 src0, src1, src2, src3, src4; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src0, data0, 16); \ + VXC_DP2x8(data1, vect10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src10, data1, 16); \ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src1, data0, 16); \ + VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src11, data1, 16); \ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src2, data0, 16); \ + VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src12, data1, 16); \ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src3, data0, 16); \ + VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src13, data1, 16); \ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_c_t, data0, 16); \ + data_i_t = src0 + src10 + b0; \ + data_f_t = src1 + src11 + b1; \ + data_g_t = src2 + src12 + b2; \ + data_o_t = src3 + src13 + b3; \ + \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + _viv_asm(COPY, vect0, data_c_t, 16); \ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(COPY, vect1, data_o_t, 16); \ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_B_BF16(SIGMOID, sigmoid) +LSTMUNIT_B_BF16(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CBP_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CBP_BF16.vx new file mode 100644 index 0000000..149d54c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CBP_BF16.vx @@ -0,0 +1,111 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; + +#define LSTMUNIT_CBP_BF16(act_name, act_func) \ +__kernel void lstmunit_activation_CBP_BF16toBF16_BF16_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \ + float4 src0, src1, src2, src3, src4; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + b1 = read_imagef(bias_f, 
coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src1, data0, 16); \ + VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src11, data1, 16); \ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src2, data0, 16); \ + VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src12, data1, 16); \ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src3, data0, 16); \ + VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src13, data1, 16); \ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_c_t, data0, 16); \ + data_f_t = src1 + src11 + b1; \ + data_g_t = src2 + src12 + b2; \ + data_o_t = src3 + src13 + b3; \ + \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(COPY, vect0, data_c_t, 16); \ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(COPY, vect1, data_o_t, 16); \ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CBP_BF16(SIGMOID, sigmoid) +LSTMUNIT_CBP_BF16(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CB_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CB_BF16.vx new file mode 100644 index 0000000..7e04936 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CB_BF16.vx @@ -0,0 +1,113 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; + +#define LSTMUNIT_CB_BF16(act_name, act_func) \ +__kernel void lstmunit_activation_CB_BF16toBF16_BF16_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only 
image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \ + float4 src0, src1, src2, src3, src4; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 b0, b1, b2, b3; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src1, data0, 16); \ + VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src11, data1, 16); \ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src2, data0, 16); \ + VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src12, data1, 16); \ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src3, data0, 16); \ + VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src13, data1, 16); \ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_c_t, data0, 16); \ + data_f_t = src1 + src11 + b1; \ + data_g_t = src2 + src12 + b2; \ + data_o_t = src3 + src13 + b3; \ + \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + _viv_asm(COPY, vect0, data_c_t, 16); \ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(COPY, vect1, data_o_t, 16); \ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CB_BF16(SIGMOID, sigmoid) +LSTMUNIT_CB_BF16(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CLP_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CLP_BF16.vx new file mode 100644 index 0000000..b6a91c2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CLP_BF16.vx @@ -0,0 +1,101 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; + +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; +#define LSTMUNIT_CLP_BF16(act_name, act_func) \ +__kernel void lstmunit_activation_CLP_BF16toBF16_BF16_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 w0, w1, w2, b0, b1, b2; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + w0 = read_imagef(layer_norm_wf, coord_in.xw); \ + w1 = read_imagef(layer_norm_wc, coord_in.xw); \ + w2 = read_imagef(layer_norm_wo, coord_in.xw); \ + b0 = read_imagef(bias_f, coord_in.xw); \ + b1 = read_imagef(bias_c, coord_in.xw); \ + b2 = read_imagef(bias_o, coord_in.xw); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + 
uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_f_t, data0, 16); \ + VXC_DP2x8(data1, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_g_t, data1, 16); \ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_o_t, data0, 16); \ + VXC_DP2x8(data1, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_c_t, data1, 16); \ + \ + data_f_t = data_f_t * w0 + b0; \ + data_g_t = data_g_t * w1 + b1; \ + data_o_t = data_o_t * w2 + b2; \ + \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(COPY, vect0, data_c_t, 16); \ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_o_t = act_func(data_o_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(COPY, vect1, data_o_t, 16); \ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CLP_BF16(SIGMOID, sigmoid) +LSTMUNIT_CLP_BF16(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CL_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CL_BF16.vx new file mode 100644 index 0000000..cf67244 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CL_BF16.vx @@ -0,0 +1,102 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; + +#define LSTMUNIT_CL_BF16(act_name, act_func) \ +__kernel void lstmunit_activation_CL_BF16toBF16_BF16_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + 
float4 w0, w1, w2, b0, b1, b2; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + w0 = read_imagef(layer_norm_wf, coord_in.xw); \ + w1 = read_imagef(layer_norm_wc, coord_in.xw); \ + w2 = read_imagef(layer_norm_wo, coord_in.xw); \ + b0 = read_imagef(bias_f, coord_in.xw); \ + b1 = read_imagef(bias_c, coord_in.xw); \ + b2 = read_imagef(bias_o, coord_in.xw); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_f_t, data0, 16); \ + VXC_DP2x8(data1, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_g_t, data1, 16); \ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_o_t, data0, 16); \ + VXC_DP2x8(data1, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_c_t, data1, 16); \ + \ + data_f_t = data_f_t * w0 + b0; \ + data_g_t = data_g_t * w1 + b1; \ + data_o_t = data_o_t * w2 + b2; \ + \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + _viv_asm(COPY, vect0, data_c_t, 16); \ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_o_t = act_func(data_o_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(COPY, vect1, data_o_t, 16); \ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CL_BF16(SIGMOID, sigmoid) +LSTMUNIT_CL_BF16(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CSP_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CSP_BF16.vx new file mode 100644 index 0000000..05ba38e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CSP_BF16.vx @@ -0,0 +1,104 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; + +#define LSTMUNIT_CSP_BF16(act_name, act_func) \ +__kernel void lstmunit_activation_CSP_BF16toBF16_BF16_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \ + float4 src0, src1, src2, src3, src4; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src1, data0, 16); \ + VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src11, data1, 16); \ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src2, data0, 16); \ + VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src12, data1, 16); \ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src3, data0, 16); \ + VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src13, data1, 16); \ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_c_t, data0, 16); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(COPY, vect0, data_c_t, 16); \ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(COPY, vect1, data_o_t, 16); \ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CSP_BF16(SIGMOID, sigmoid) +LSTMUNIT_CSP_BF16(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CS_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CS_BF16.vx new file mode 100644 index 0000000..b657e42 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_CS_BF16.vx @@ -0,0 +1,106 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; + +#define LSTMUNIT_CS_BF16(act_name, act_func) \ +__kernel void lstmunit_activation_CS_BF16toBF16_BF16_##act_name( \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t 
cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \ + float4 src0, src1, src2, src3, src4; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src1, data0, 16); \ + VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src11, data1, 16); \ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src2, data0, 16); \ + VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src12, data1, 16); \ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src3, data0, 16); \ + VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src13, data1, 16); \ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_c_t, data0, 16); \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = 1.0 - data_f_t; \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + _viv_asm(COPY, vect0, data_c_t, 16); \ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(COPY, vect1, data_o_t, 16); \ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_CS_BF16(SIGMOID, sigmoid) +LSTMUNIT_CS_BF16(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_LP_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_LP_BF16.vx new file mode 100644 index 0000000..61bd208 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_LP_BF16.vx @@ -0,0 +1,110 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; + +#define LSTMUNIT_LP_BF16(act_name, act_func) \ +__kernel void lstmunit_activation_LP_BF16toBF16_BF16_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only image2d_t layer_norm_wi, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 w0, w1, w2, w3, b0, b1, b2, b3; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + w0 = read_imagef(layer_norm_wi, coord_in.xw); \ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = 
read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_i_t, data0, 16); \ + VXC_DP2x8(data1, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_f_t, data1, 16); \ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_g_t, data0, 16); \ + VXC_DP2x8(data1, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_o_t, data1, 16); \ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_c_t, data0, 16); \ + \ + data_i_t = data_i_t * w0 + b0; \ + data_f_t = data_f_t * w1 + b1; \ + data_g_t = data_g_t * w2 + b2; \ + data_o_t = data_o_t * w3 + b3; \ + \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \ + _viv_asm(COPY, vect0, data_c_t, 16); \ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_o_t = act_func(data_o_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(COPY, vect1, data_o_t, 16); \ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_LP_BF16(SIGMOID, sigmoid) +LSTMUNIT_LP_BF16(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_L_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_L_BF16.vx new file mode 100644 index 0000000..de87624 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_L_BF16.vx @@ -0,0 +1,112 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; + +#define LSTMUNIT_L_BF16(act_name, act_func) \ +__kernel void lstmunit_activation_L_BF16toBF16_BF16_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_t bias_i, \ + __read_only image2d_t bias_f, \ + __read_only image2d_t bias_c, \ + __read_only image2d_t bias_o, \ + __read_only 
image2d_t layer_norm_wi, \ + __read_only image2d_t layer_norm_wf, \ + __read_only image2d_t layer_norm_wc, \ + __read_only image2d_t layer_norm_wo, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 w0, w1, w2, w3, b0, b1, b2, b3; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + w0 = read_imagef(layer_norm_wi, coord_in.xw); \ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \ + b0 = read_imagef(bias_i, coord_in.xw); \ + b1 = read_imagef(bias_f, coord_in.xw); \ + b2 = read_imagef(bias_c, coord_in.xw); \ + b3 = read_imagef(bias_o, coord_in.xw); \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_i_t, data0, 16); \ + VXC_DP2x8(data1, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_f_t, data1, 16); \ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_g_t, data0, 16); \ + VXC_DP2x8(data1, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_o_t, data1, 16); \ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_c_t, data0, 16); \ + \ + data_i_t = data_i_t * w0 + b0; \ + data_f_t = data_f_t * w1 + b1; \ + data_g_t = data_g_t * w2 + b2; \ + data_o_t = data_o_t * w3 + b3; \ + \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + _viv_asm(COPY, vect0, data_c_t, 16); \ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_o_t = act_func(data_o_t); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(COPY, vect1, data_o_t, 16); \ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_L_BF16(SIGMOID, sigmoid) +LSTMUNIT_L_BF16(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_SP_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_SP_BF16.vx new file mode 100644 index 0000000..1283676 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_SP_BF16.vx @@ -0,0 +1,117 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; + +#define LSTMUNIT_SP_BF16(act_name, act_func) \ +__kernel void lstmunit_activation_SP_BF16toBF16_BF16_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1, data2, data3; \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + vxc_float4 src0, src1, src2, src3; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_float4 src10, src11, src12, src13; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src0, data0, 16); \ + VXC_DP2x8(data1, vect10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src10, data1, 16); \ + VXC_DP2x8(data2, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src1, data2, 16); \ + VXC_DP2x8(data3, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src11, data3, 16); \ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src2, data0, 16); \ + VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src12, data1, 16); \ + VXC_DP2x8(data2, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src3, data2, 16); \ + VXC_DP2x8(data3, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src13, data3, 16); \ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_c_t, data0, 16); \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + _viv_asm(COPY, data0, data_c_t, 16); \ + VXC_DP2x8(data1, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniExtractOddData_2x8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(COPY, data0, data_o_t, 16); \ + VXC_DP2x8(data1, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_SP_BF16(SIGMOID, sigmoid) +LSTMUNIT_SP_BF16(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_S_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_S_BF16.vx new file mode 100644 index 0000000..8d60efe --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/lstmunit_activation_S_BF16.vx @@ -0,0 +1,118 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform float logE; +_viv_uniform float twoLogE; +_viv_uniform float forget_bias; +float4 sigmoid(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tangentH(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform float4 clip_Min_F; +_viv_uniform float4 clip_Max_F; + +#define LSTMUNIT_S_BF16(act_name, act_func) \ +__kernel void lstmunit_activation_S_BF16toBF16_BF16_##act_name( \ + __read_only image2d_array_t input_i_conv, \ + __read_only image2d_array_t input_f_conv, \ + __read_only image2d_array_t input_c_conv, \ + __read_only image2d_array_t input_o_conv, \ + __read_only image2d_t cell_state_in, \ + __read_only image2d_array_t hstate_i_conv, \ + __read_only image2d_array_t hstate_f_conv, \ + __read_only image2d_array_t hstate_c_conv, \ + __read_only image2d_array_t hstate_o_conv, \ + __write_only image2d_array_t output, \ + __write_only image2d_t cell_state_out, \ + __write_only image2d_t h_state_out, \ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \ + vxc_float4 src0, src1, src2, src3, src4; \ + vxc_short8 vect10, vect11, vect12, vect13; \ + vxc_float4 src10, src11, src12, src13; \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src0, data0, 16); \ + VXC_DP2x8(data1, vect10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src10, data1, 16); \ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src1, data0, 16); \ + VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src11, data1, 16); \ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src2, data0, 16); \ + VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src12, data1, 16); \ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src3, data0, 16); \ + VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, src13, data1, 16); \ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data_c_t, data0, 16); \ + \ + data_i_t = src0 + src10; \ + data_f_t = src1 + src11; \ + data_g_t = src2 + src12; \ + data_o_t = src3 + src13; \ + \ + data_i_t = act_func(data_i_t); \ + data_f_t = act_func(data_f_t + forget_bias); \ + data_g_t = tangentH(data_g_t); \ + data_i_t = data_i_t * data_g_t; \ + data_c_t = data_c_t * data_f_t + data_i_t; \ + data_o_t = act_func(data_o_t); \ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \ + _viv_asm(COPY, vect0, data_c_t, 16); \ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + data_c_t = tangentH(data_c_t); \ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \ + _viv_asm(COPY, vect1, data_o_t, 16); \ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +LSTMUNIT_S_BF16(SIGMOID, sigmoid) +LSTMUNIT_S_BF16(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/maxpool.vx b/src/tim/vx/internal/src/libnnext/ops/vx/maxpool.vx new file mode 100644 index 0000000..969920a --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/maxpool.vx @@ -0,0 +1,283 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvF16toFp32_4x4; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform float inout_scale; +_viv_uniform float inout_tail; + +_viv_uniform int width; +_viv_uniform int height; + +#define MAXPOOL_QINT(in_name, out_name, src_type, dst_type, max_val) \ +__kernel void maxpool_##in_name##to##out_name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int stride_x, int stride_y, int pad_x, int pad_y, \ + int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord_out = (int4)(gidx, gidy, gidz, 0); \ + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); \ + int4 coord_in = coord_out; \ + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); \ + for(; pos_start.x < 0;) \ + { \ + pos_start.x += dilation_x; \ + } \ + for(; pos_start.y < 0;) \ + { \ + pos_start.y += dilation_y; \ + } \ + pos_end = min(pos_end, (int2)(width, height)); \ + \ + src_type src0; \ + dst_type maxVal; \ + maxVal.x = max_val; \ + \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr_a); \ + \ + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) \ + { \ + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x += dilation_x; \ + VXC_VertMax3_Integer(maxVal, src0, src0, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + \ + float4 fValTmp; \ + fValTmp.x = maxVal.x * inout_scale + inout_tail; \ + int4 i4Val = convert_int4_rte(fValTmp); \ + VXC_DP2x8(maxVal, i4Val, i4Val, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord_out, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +MAXPOOL_QINT(U8, U8, vxc_uchar8, vxc_uchar8, 0) +MAXPOOL_QINT(I8, I8, vxc_char8, vxc_char8, -128) +MAXPOOL_QINT(I16, I16, vxc_short8, vxc_short8, -32768) + +__kernel void maxpool_F16toF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int stride_x, int 
stride_y, int pad_x, int pad_y, + int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord_out = (int4)(gidx, gidy, gidz, 0); + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); + int4 coord_in = coord_out; + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); + for(; pos_start.x < 0;) + { + pos_start.x += dilation_x; + } + for(; pos_start.y < 0;) + { + pos_start.y += dilation_y; + } + pos_end = min(pos_end, (int2)(width, height)); + + vxc_short8 data0; + vxc_half8 maxVal, src0; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr_a); + coord_in.xy = pos_start; + + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, maxVal, data0, 16); + + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) + { + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) + { + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x += dilation_x; + _viv_asm(COPY, src0, data0, 16); + VXC_VertMax3_Half(maxVal, src0, src0, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } + } + _viv_asm(COPY, data0, maxVal, 16); + VXC_WriteImage2DArray(output, coord_out, data0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +#define MAXPOOL_F16_TO_QINT(out_name, dst_type) \ +__kernel void maxpool_F16to##out_name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int stride_x, int stride_y, int pad_x, int pad_y, \ + int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord_out = (int4)(gidx, gidy, gidz, 0); \ + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); \ + int4 coord_in = coord_out; \ + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); \ + for(; pos_start.x < 0;) \ + { \ + pos_start.x += dilation_x; \ + } \ + for(; pos_start.y < 0;) \ + { \ + pos_start.y += dilation_y; \ + } \ + pos_end = min(pos_end, (int2)(width, height)); \ + \ + vxc_short8 data0; \ + vxc_half8 maxVal, src0; \ + \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr_a); \ + coord_in.xy = pos_start; \ + \ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, maxVal, data0, 16); \ + \ + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) \ + { \ + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) \ + { \ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x += dilation_x; \ + _viv_asm(COPY, src0, data0, 16); \ + VXC_VertMax3_Half(maxVal, src0, src0, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + float4 fValTmp; \ + VXC_DP4x4(fValTmp, maxVal, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniConvF16toFp32_4x4); \ + fValTmp.x = fValTmp.x * inout_scale + inout_tail; \ + int4 i4Val = convert_int4_rte(fValTmp); \ + dst_type dst; \ + VXC_DP2x8(dst, i4Val, i4Val, VXC_MODIFIER(0, 0, 0, 
VXC_RM_ToNearestEven, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} + +MAXPOOL_F16_TO_QINT(U8, vxc_uchar8) +MAXPOOL_F16_TO_QINT(I8, vxc_char8) +MAXPOOL_F16_TO_QINT(I16, vxc_short8) + +#define MAXPOOL_QINT_TO_F16(in_name, src_type, max_val) \ +__kernel void maxpool_##in_name##toF16( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int stride_x, int stride_y, int pad_x, int pad_y, \ + int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord_out = (int4)(gidx, gidy, gidz, 0); \ + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); \ + int4 coord_in = coord_out; \ + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); \ + for(; pos_start.x < 0;) \ + { \ + pos_start.x += dilation_x; \ + } \ + for(; pos_start.y < 0;) \ + { \ + pos_start.y += dilation_y; \ + } \ + pos_end = min(pos_end, (int2)(width, height)); \ + \ + src_type src0, maxVal; \ + maxVal.x = max_val; \ + \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr_a); \ + \ + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) \ + { \ + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x += dilation_x; \ + VXC_VertMax3_Integer(maxVal, src0, src0, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + \ + float4 fValTmp; \ + fValTmp.x = maxVal.x * inout_scale + inout_tail; \ + half4 h4Val; \ + _viv_asm(CONV, h4Val, fValTmp); \ + vxc_short8 dst; \ + _viv_asm(COPY, dst, h4Val, 4); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ +} +MAXPOOL_QINT_TO_F16(U8, vxc_uchar8, 0) +MAXPOOL_QINT_TO_F16(I8, vxc_char8, -128) +MAXPOOL_QINT_TO_F16(I16, vxc_short8, -32768) + +__kernel void maxpool_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int stride_x, int stride_y, int pad_x, int pad_y, + int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord_out = (int4)(gidx, gidy, gidz, 0); + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); + int4 coord_in = coord_out; + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); + for(; pos_start.x < 0;) + { + pos_start.x += dilation_x; + } + for(; pos_start.y < 0;) + { + pos_start.y += dilation_y; + } + pos_end = min(pos_end, (int2)(width, height)); + + vxc_short8 data0, val0; + float4 maxVal, src0; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr_a); + coord_in.xy = pos_start; + + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + + VXC_DP2x8(val0, data0, zero, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, maxVal, val0, 4); + + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += 
dilation_y) + { + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) + { + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x += dilation_x; + VXC_DP2x8(val0, data0, zero, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src0, val0, 4); + maxVal = max(src0, maxVal); + } + } + _viv_asm(COPY, data0, maxVal, 16); + VXC_DP2x8(val0, data0, data0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord_out, val0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx index a20a579..affb8d9 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx @@ -17,6 +17,8 @@ _viv_uniform VXC_512Bits uniConvertNV12toR_4x4; _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8; +_viv_uniform VXC_512Bits uniExtractYtoShortSub16_2x8; +_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4; #define NV12_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ __kernel void pre_process_nv12_copy_##name \ @@ -57,14 +59,24 @@ __kernel void pre_process_nv12_copy_##name \ UV.s0123 = UV.s1032; \ } \ \ + vxc_short8 tmpY; \ vxc_char16 tmpUV; \ - short tmpVal = 128; \ + short tmpVal = 16; \ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractYtoShortSub16_2x8); \ + tmpVal = 128; \ VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \ \ float4 tmpDstB, tmpDstG, tmpDstR; \ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ \ conv_type result; \ dst_type dst0; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx index 2fe9ad6..c017e4e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx @@ -22,9 +22,11 @@ _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8; +_viv_uniform VXC_512Bits uniConvertYtoShortSub16_2x8; _viv_uniform VXC_512Bits uniCalculateYShift_2x8; _viv_uniform VXC_512Bits uniCalculateUVShift_2x8; +_viv_uniform 
VXC_512Bits uniConvertUchartoFp32_4x4; #define NV12_OPT_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ __kernel void pre_process_nv12_scale_##name##_gq \ @@ -85,14 +87,24 @@ __kernel void pre_process_nv12_scale_##name##_gq \ VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ \ + vxc_short8 tmpY; \ vxc_char16 tmpUV; \ - short tmpVal = 128; \ + short tmpVal = 16; \ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertYtoShortSub16_2x8); \ + tmpVal = 128; \ VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \ \ float4 tmpDstB, tmpDstG, tmpDstR; \ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ \ conv_type result; \ dst_type dst0; \ @@ -181,14 +193,24 @@ __kernel void pre_process_nv12_scale_##name \ UV.s01234567 = UV.s10325476; \ } \ \ + vxc_short8 tmpY; \ vxc_char16 tmpUV; \ - short tmpVal = 128; \ + short tmpVal = 16; \ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertYtoShortSub16_2x8); \ + tmpVal = 128; \ VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \ \ float4 tmpDstB, tmpDstG, tmpDstR; \ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ \ conv_type result; \ dst_type dst0; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx 
b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx index b308e65..b6515ce 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx @@ -118,7 +118,7 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ - int4 coord_out = coord; \ + int4 coord_out = coord.wwzw; \ coord_out.xyw += rgb_order.xyz; \ float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \ rMean * r_scale * output_scale - output_zp, \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx index 0006e4a..7aabce2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx @@ -16,6 +16,7 @@ _viv_uniform VXC_512Bits uniConvertYUV422toR_4x4; _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform VXC_512Bits uniExtractYUVtoShortSub_2x8; +_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4; #define YUV422_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ __kernel void pre_process_yuv422_copy_##name \ @@ -54,11 +55,21 @@ __kernel void pre_process_yuv422_copy_##name \ } \ \ float4 tmpDstB, tmpDstG, tmpDstR; \ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \ vxc_short2 value = (vxc_short2)(128,16); \ VXC_DP2x8(tmpYUV, YUV, value, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractYUVtoShortSub_2x8); \ - VXC_DP4x4(tmpDstB, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \ - VXC_DP4x4(tmpDstG, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \ - VXC_DP4x4(tmpDstR, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \ + VXC_DP4x4(DstB_uchar, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertYUV422toB_4x4); \ + VXC_DP4x4(DstG_uchar, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertYUV422toG_4x4); \ + VXC_DP4x4(DstR_uchar, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertYUV422toR_4x4); \ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ \ conv_type result; \ dst_type dst0; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx index 9fb80e5..22aeaaa 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx @@ -21,6 +21,7 @@ _viv_uniform VXC_512Bits uniConvertYUV422toR_4x4; _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8; _viv_uniform VXC_512Bits uniExtractYtoShortSub16_4x4; +_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4; #define uyvy422 1 @@ -70,8 +71,8 @@ __kernel void pre_process_yuv422_scale_##name \ } \ \ int4 coord_Y = 
(int4)(sx.x + y_offset, sy, 0, 0); \ - int4 coord_U = (int4)((sx.x >> 1) * 2 + u_offset, sy, 0, 0); \ - int4 coord_V = (int4)((sx.x >> 1) * 2 + v_offset, sy, 0, 0); \ + int4 coord_U = (int4)((sx.x >> 2) * 4 + u_offset, sy, 0, 0); \ + int4 coord_V = (int4)((sx.x >> 2) * 4 + v_offset, sy, 0, 0); \ \ VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ coord_Y.x = sx.y + y_offset; \ @@ -81,7 +82,7 @@ __kernel void pre_process_yuv422_scale_##name \ coord_Y.x = sx.w + y_offset; \ VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ \ - sx = (sx >> 1) * 2 + u_offset; \ + sx = (sx >> 2) * 4 + u_offset; \ VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ coord_U.x = sx.y; \ VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ @@ -105,9 +106,19 @@ __kernel void pre_process_yuv422_scale_##name \ VXC_DP2x8(dst_test, dx, dx, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ \ float4 tmpDstB, tmpDstG, tmpDstR; \ - VXC_DP4x4(tmpDstB, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \ - VXC_DP4x4(tmpDstG, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \ - VXC_DP4x4(tmpDstR, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertYUV422toB_4x4); \ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertYUV422toG_4x4); \ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\ + uniConvertYUV422toR_4x4); \ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ \ conv_type result; \ dst_type dst0; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_2d.vx index 752813e..296384c 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_2d.vx @@ -21,8 +21,8 @@ __kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_2D( \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ src0_type src0; \ src0_copy_type srcA; \ - src0_type src1; \ - src0_copy_type srcB; \ + src1_type src1; \ + src1_copy_type srcB; \ VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, srcA, src0, 16); \ VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_3d.vx index f24a924..dec8254 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_3d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/relational_ops_3d.vx @@ -21,8 +21,8 @@ __kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8( \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ src0_type src0; \ src0_copy_type srcA; \ - src0_type src1; \ - 
src0_copy_type srcB; \ + src1_type src1; \ + src1_copy_type srcB; \ VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, srcA, src0, 16); \ VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_qint.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_qint.vx index 2284f49..2dda303 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_qint.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_qint.vx @@ -28,37 +28,40 @@ __kernel void scatter_nd_update_reset_##name0##to##name1( \ Image img1 = create_image_from_image2d(input_ref, size0); \ Image img2 = create_image_from_image2d(temp_ref, size1); \ Image img3 = create_image_from_image2d(temp_buf_int, 4); \ - __global ptr0* input_ptr = (__global ptr0*)img1.ptr; \ - __global ptr1* output_ptr = (__global ptr1*)img2.ptr; \ __global int* tmp_update_ptr = (__global int*)img3.ptr; \ - ptr0 tmpData = input_ptr[gidx]; \ - int4 zeros = (int4)(0); \ - int loc2 = gidx * 8; \ type0 src; \ type1 tmpDst; \ - ptr1 dst; \ vxc_ushort8 ms0; \ _viv_asm(COPY, ms0, multAndoutZP0, 16); \ - _viv_asm(COPY, src, tmpData, len0); \ - VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ - uniU8MulAndPostShift_0_Lo_2x8); \ - _viv_asm(COPY, dst, tmpDst, len1); \ - output_ptr[gidx] = dst; \ - vstore4(zeros, 0, tmp_update_ptr + loc2); \ - vstore4(zeros, 1, tmp_update_ptr + loc2); \ - if(gidx < res) \ + if(length > 0) \ { \ - __global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \ - __global ptr3* output_ptr1 = (__global ptr3*)img2.ptr; \ - ptr2 tmpData1 = input_ptr1[length + gidx]; \ + __global ptr0* input_ptr = (__global ptr0*)img1.ptr; \ + __global ptr1* output_ptr = (__global ptr1*)img2.ptr; \ + ptr0 tmpData = input_ptr[gidx]; \ + int4 zeros = (int4)(0); \ + int loc2 = gidx * 8; \ + ptr1 dst; \ + _viv_asm(COPY, src, tmpData, len0); \ + VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst, tmpDst, len1); \ + output_ptr[gidx] = dst; \ + vstore4(zeros, 0, tmp_update_ptr + loc2); \ + vstore4(zeros, 1, tmp_update_ptr + loc2); \ + } \ + __global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \ + __global ptr3* output_ptr1 = (__global ptr3*)img2.ptr; \ + for(int i = gidx; i < res; i += get_global_size(0)) \ + { \ + ptr2 tmpData1 = input_ptr1[length + i]; \ ptr3 dst1; \ dst1 ^= dst1; \ - tmp_update_ptr[length + gidx] = 0; \ + tmp_update_ptr[length + i] = 0; \ _viv_asm(COPY, src, tmpData1, 4); \ VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ uniU8MulAndPostShift_0_Lo_2x8); \ _viv_asm(COPY, dst1, tmpDst, len3); \ - output_ptr1[length + gidx] = dst1; \ + output_ptr1[length + i] = dst1; \ } \ } SCATTER_RESET(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, 8, 8, 1, 1, uchar, uchar, 1) @@ -246,14 +249,17 @@ __kernel void scatter_nd_update_copy_##src0_type( \ int gidx = get_global_id(0); \ Image img1 = create_image_from_image2d(temp_ref, element_size); \ Image img2 = create_image_from_image2d(output, element_size); \ - __global ptr_type* input_ptr = (__global ptr_type*)img1.ptr; \ - __global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \ - output_ptr[gidx] = input_ptr[gidx]; \ - if(gidx < res) \ + if(length > 0) \ { \ - __global ptr_type1* input_ptr1 = (__global ptr_type1*)img1.ptr; \ - __global ptr_type1* output_ptr1 = (__global 
ptr_type1*)img2.ptr; \ - output_ptr1[length + gidx] = input_ptr1[length + gidx]; \ + __global ptr_type* input_ptr = (__global ptr_type*)img1.ptr; \ + __global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \ + output_ptr[gidx] = input_ptr[gidx]; \ + } \ + __global ptr_type1* input_ptr1 = (__global ptr_type1*)img1.ptr; \ + __global ptr_type1* output_ptr1 = (__global ptr_type1*)img2.ptr; \ + for(int i = gidx; i < res; i += get_global_size(0)) \ + { \ + output_ptr1[length + i] = input_ptr1[length + i]; \ } \ } SCATTER_ND_UPDATE_COPY(U8, vxc_uchar8, 1, uchar) diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 5421a5a..dd10737 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -5997,157 +5997,6 @@ static const char custom_warp_affine_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_ _viv_uniform float4 matrix0;\n\ _viv_uniform float2 matrix1;\n\ _viv_uniform float4 matrix4;\n\ -__kernel void custom_warp_affine_nearest_neighbor_U8toU8_2D\n\ -(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - float _m0,\n\ - float _m1,\n\ - float _m2,\n\ - float _m3,\n\ - float _m4,\n\ - float _m5\n\ -)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ -\n\ - float4 coord_f = convert_float4(coord_in);\n\ -\n\ - coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ -\n\ - coord_in = convert_int4(coord_f);\n\ -\n\ - vxc_uchar16 dst;\n\ - VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_f = coord_f.zwzw + matrix4;\n\ - coord_in = convert_int4(coord_f);\n\ - VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_f = coord_f.zwzw + matrix4;\n\ - coord_in = convert_int4(coord_f);\n\ - VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_f = coord_f.zwzw + matrix4;\n\ - coord_in = convert_int4(coord_f);\n\ - VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ -\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void custom_warp_affine_bilinear_U8toU8_2D\n\ -(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - float _m0,\n\ - float _m1,\n\ - float _m2,\n\ - float _m3,\n\ - float _m4,\n\ - float _m5\n\ -)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ -\n\ - float4 coord_f = convert_float4(coord_in);\n\ -\n\ - coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ -\n\ - coord_in = convert_int4(coord_f);\n\ -\n\ - vxc_uchar16 src0, src1, dst;\n\ - VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 
0));\n\ - VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ -#if (VX_VERSION==1)\n\ - VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ -#else\n\ - VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - src1.s0 = src0.s1;\n\ - VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ -#endif\n\ -\n\ - VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ -#if (VX_VERSION==1)\n\ - VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ -#else\n\ - VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - src1.s0 = src0.s1;\n\ - VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ -#endif\n\ -\n\ - coord_f = coord_f.zwzw + matrix4;\n\ - coord_in = convert_int4(coord_f);\n\ - VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ -#if (VX_VERSION==1)\n\ - VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ -#else\n\ - VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - src1.s0 = src0.s1;\n\ - VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ -#endif\n\ -\n\ - VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ -#if (VX_VERSION==1)\n\ - VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -#else\n\ - VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - src1.s0 = src0.s1;\n\ - VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -#endif\n\ -\n\ - coord_f = coord_f.zwzw + matrix4;\n\ - coord_in = convert_int4(coord_f);\n\ - VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ -#if (VX_VERSION==1)\n\ - VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ -#else\n\ - VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - src1.s0 = src0.s1;\n\ - VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ -#endif\n\ -\n\ - VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ -#if (VX_VERSION==1)\n\ - VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ -#else\n\ - VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - src1.s0 = src0.s1;\n\ - VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ -#endif\n\ -\n\ - coord_f = coord_f.zwzw + matrix4;\n\ - coord_in 
= convert_int4(coord_f);\n\ - VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ -#if (VX_VERSION==1)\n\ - VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ -#else\n\ - VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - src1.s0 = src0.s1;\n\ - VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ -#endif\n\ -\n\ - VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ -#if (VX_VERSION==1)\n\ - VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -#else\n\ - VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - src1.s0 = src0.s1;\n\ - VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -#endif\n\ -\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ \n\ __kernel void custom_warp_affine_nearest_neighbor_U8toU8\n\ (\n\ @@ -6345,6 +6194,509 @@ __kernel void custom_warp_affine_bilinear_U8toU8\n\ }\n\ "; /* end of custom_warp_affine_vx*/ +static const char custom_warp_affine_2d_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float4 matrix0;\n\ +_viv_uniform float2 matrix1;\n\ +_viv_uniform float4 matrix4;\n\ +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_2D\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_affine_bilinear_U8toU8_2D\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only 
image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ 
+#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of custom_warp_affine_2d_vx*/ + +static const char custom_warp_affine_optional_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define WARP_AFFINE(name) \\\n\ +__kernel void custom_warp_affine_##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t matrix, \\\n\ + __write_only image2d_array_t output, \\\n\ + float _m0, \\\n\ + float _m1, \\\n\ + float _m2, \\\n\ + float _m3, \\\n\ + float _m4, \\\n\ + float _m5 \\\n\ +) \\\n\ +\n\ +#define GET_MATRIX_VALUE \\\n\ + float4 matrix0; \\\n\ + float2 matrix1; \\\n\ + float4 matrix4; \\\n\ + int2 coord_matrix = (int2)(0,0); \\\n\ + Image img1 = create_image_from_image2d(matrix, 4); \\\n\ + __global float* matrix_ptr = (__global float*)img1.ptr; \\\n\ + matrix0 = vload4(0, matrix_ptr); \\\n\ + matrix1 = vload2(2, matrix_ptr); \\\n\ + matrix4.x = matrix0.x; \\\n\ + matrix4.y = matrix0.y; \\\n\ + matrix4.z = matrix0.x * 2; \\\n\ + matrix4.w = matrix0.y * 2; \\\n\ +\n\ +WARP_AFFINE(nearest_neighbor_U8toU8_2D_optional_input)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + GET_MATRIX_VALUE\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, 
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +WARP_AFFINE(bilinear_U8toU8_2D_optional_input)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + GET_MATRIX_VALUE\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 
0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +WARP_AFFINE(nearest_neighbor_U8toU8_optional_input)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + GET_MATRIX_VALUE\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, 
coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +WARP_AFFINE(bilinear_U8toU8_optional_input)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + GET_MATRIX_VALUE\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = 
coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, 
VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of custom_warp_affine_optional_vx*/ + static const char custom_warp_affine_rgb_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ \n\ #include \"cl_viv_vx_ext.h\"\n\ @@ -6662,6 +7014,340 @@ __kernel void custom_warp_affine_bilinear_U8toU8_rgb\n\ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of custom_warp_affine_rgb_vx*/ +static const char custom_warp_affine_rgb_optional_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define GET_MATRIX_VALUE \\\n\ + float4 matrix0; \\\n\ + float2 matrix1; \\\n\ + Image img1 = create_image_from_image2d(matrix, 4); \\\n\ + __global float* matrix_ptr = (__global float*)img1.ptr; \\\n\ + matrix0 = vload4(0, matrix_ptr); \\\n\ + matrix1 = vload2(2, matrix_ptr); \\\n\ +\n\ +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb_2D_optional_input\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t matrix,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ + int2 coord_matrix = (int2)(0,0);\n\ + GET_MATRIX_VALUE\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in.x = floor(coord_f.x) * 3;\n\ + coord_in.y = floor(coord_f.y);\n\ + coord_in.z = floor(coord_f.z) * 3;\n\ + coord_in.w = floor(coord_f.w);\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = coord_in.x + 1;\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = coord_in.x + 1;\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z = coord_in.z + 1;\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z = coord_in.z + 1;\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_affine_bilinear_U8toU8_rgb_2D_optional_input\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t matrix,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));\n\ + 
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ + int2 coord_matrix = (int2)(0,0);\n\ + GET_MATRIX_VALUE\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in.x = floor(coord_f.x) * 3;\n\ + coord_in.y = floor(coord_f.y);\n\ + coord_in.z = floor(coord_f.z) * 3;\n\ + coord_in.w = floor(coord_f.w);\n\ +\n\ + vxc_uchar16 src0, src1, src_0, src_1, dst;\n\ + VXC_ReadImage(src_0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src_1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + src0.x = src_0.s0;\n\ + src0.y = src_0.s3;\n\ + src1.x = src_1.s0;\n\ + src1.y = src_1.s3;\n\ +\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + src0.x = src_0.s1;\n\ + src0.y = src_0.s4;\n\ + src1.x = src_1.s1;\n\ + src1.y = src_1.s4;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + src0.x = src_0.s2;\n\ + src0.y = src_0.s5;\n\ + src1.x = src_1.s2;\n\ + src1.y = src_1.s5;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src_0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src_1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + src0.x = src_0.s0;\n\ + src0.y = src_0.s3;\n\ + src1.x = src_1.s0;\n\ + src1.y = src_1.s3;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + src0.x = src_0.s1;\n\ + src0.y = src_0.s4;\n\ + src1.x = src_1.s1;\n\ + src1.y = src_1.s4;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + src0.x = src_0.s2;\n\ + src0.y = src_0.s5;\n\ + src1.x = src_1.s2;\n\ + src1.y = src_1.s5;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + 
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb_optional_input\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t matrix,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ + int2 coord_matrix = (int2)(0,0);\n\ + GET_MATRIX_VALUE\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in.x = floor(coord_f.x) * 3;\n\ + coord_in.y = floor(coord_f.y);\n\ + coord_in.z = floor(coord_f.z) * 3;\n\ + coord_in.w = floor(coord_f.w);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.x = coord_input.x + 1;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.x = coord_input.x + 1;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.x = coord_input.x + 1;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.x = coord_input.x + 1;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_affine_bilinear_U8toU8_rgb_optional_input\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t matrix,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ + int2 coord_matrix = (int2)(0,0);\n\ + GET_MATRIX_VALUE\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in.x = floor(coord_f.x) * 3;\n\ + coord_in.y = floor(coord_f.y);\n\ + coord_in.z = floor(coord_f.z) * 3;\n\ + coord_in.w = floor(coord_f.w);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 src0, src1, src_0, src_1, dst;\n\ + 
VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + src0.x = src_0.s0;\n\ + src0.y = src_0.s3;\n\ + src1.x = src_1.s0;\n\ + src1.y = src_1.s3;\n\ +\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + src0.x = src_0.s1;\n\ + src0.y = src_0.s4;\n\ + src1.x = src_1.s1;\n\ + src1.y = src_1.s4;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + src0.x = src_0.s2;\n\ + src0.y = src_0.s5;\n\ + src1.x = src_1.s2;\n\ + src1.y = src_1.s5;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + src0.x = src_0.s0;\n\ + src0.y = src_0.s3;\n\ + src1.x = src_1.s0;\n\ + src1.y = src_1.s3;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + src0.x = src_0.s1;\n\ + src0.y = src_0.s4;\n\ + src1.x = src_1.s1;\n\ + src1.y = src_1.s4;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + src0.x = src_0.s2;\n\ + src0.y = src_0.s5;\n\ + src1.x = src_1.s2;\n\ + src1.y = src_1.s5;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of custom_warp_affine_rgb_optional_vx*/ + static const char custom_warp_perspective_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ \n\ #include \"cl_viv_vx_ext.h\"\n\ @@ -9783,6 +10469,8 @@ __kernel void 
gather_F16toF16_axis0(\n\ static const char gather_array_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int indices_num;\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int width;\n\ _viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8;\n\ \n\ __kernel void gather_I8toI8_array(\n\ @@ -9913,10 +10601,12 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \\\n\ int axis_num \\\n\ ) \\\n\ { \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + if (coord.x >= width) return; \\\n\ Image img0 = create_image_from_image2d(input0, 1); \\\n\ Image img1 = create_image_from_image2d(input1, 4); \\\n\ Image img2 = create_image_from_image2d(output, 1); \\\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ uchar* index_ptr = get_image_ptr_from_coord(img1, coord.xz); \\\n\ __global int* index = (__global int*)index_ptr; \\\n\ int4 indices = vload4(0, index); \\\n\ @@ -9928,10 +10618,30 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \\\n\ __global data_type* data_ptr = (__global data_type*)input_ptr; \\\n\ __global write_type* out_ptr = (__global write_type*)output_ptr; \\\n\ indices = indices >= 0 ? indices : indices + axis_num; \\\n\ - src.s0 = data_ptr[indices.x]; \\\n\ - src.s1 = data_ptr[indices.y]; \\\n\ - src.s2 = data_ptr[indices.z]; \\\n\ - src.s3 = data_ptr[indices.w]; \\\n\ + if (coord.x + remainder < width) \\\n\ + { \\\n\ + src.s0 = data_ptr[indices.x]; \\\n\ + src.s1 = data_ptr[indices.y]; \\\n\ + src.s2 = data_ptr[indices.z]; \\\n\ + src.s3 = data_ptr[indices.w]; \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + __global data_type* out_ptr_remainder = (__global data_type*)output_ptr; \\\n\ + switch (remainder) \\\n\ + { \\\n\ + case 3: \\\n\ + out_ptr_remainder[2] = data_ptr[indices.z]; \\\n\ + case 2: \\\n\ + out_ptr_remainder[1] = data_ptr[indices.y]; \\\n\ + case 1: \\\n\ + out_ptr_remainder[0] = data_ptr[indices.x]; \\\n\ + break; \\\n\ + default: \\\n\ + break; \\\n\ + } \\\n\ + return; \\\n\ + } \\\n\ \\\n\ VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ uniExtraCopyDpKeepinEvis_2x8); \\\n\ @@ -12849,6 +13559,9 @@ static const char grucell_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ #define logE (1.44269502f)\n\ #define twoLogE (2.88539004f)\n\ \n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ float4 sigmoid(float4 x)\n\ {\n\ x *= -logE;\n\ @@ -12950,6 +13663,56 @@ GRUCELL_ACTIVATION_SIGMOID_TANH(F16, F16, F16, U8, hsigmoid,\n\ #undef UCHAR8\n\ #undef SHORT8\n\ #undef HALF8\n\ +\n\ +#define GRUCELL_ACTIVATION_SIGMOID_TANH_BF16(activater) \\\n\ +__kernel void grucell_activation_BF16_BF16_BF16_to_BF16_##activater \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __read_only image2d_array_t input2, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_array_t hstate, \\\n\ + int gate_activation, \\\n\ + int candidate_activation \\\n\ + ) \\\n\ +{ \\\n\ + vxc_short8 src00, src10, src20, data0, data1; \\\n\ + float4 src01, src11, src21; \\\n\ + \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + VXC_ReadImage(src00, input0, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src10, input1, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src20, input2, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_short8 zero 
= (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(data0, src00, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src01, data0, 16); \\\n\ + VXC_DP2x8(data1, src10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src11, data1, 16); \\\n\ + VXC_DP2x8(data0, src20, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src21, data0, 16); \\\n\ + \\\n\ + src01 = src01 * tensorScale.xxxx - tensorZP.xxxx; \\\n\ + src01 = activater(src01); \\\n\ + \\\n\ + src11 = src11 * tensorScale.yyyy - tensorZP.yyyy; \\\n\ + src11 = tangentH(src11); \\\n\ + \\\n\ + src21 = src21 * tensorScale.zzzz - tensorZP.zzzz; \\\n\ + \\\n\ + src11 = src11 - src01 * src11; \\\n\ + src11 = src01 * src21 + src11; \\\n\ + \\\n\ + src11 = src11 * tensorScale.wwww + tensorZP.wwww; \\\n\ + _viv_asm(COPY, src00, src11, 16); \\\n\ + VXC_DP2x8(data0, src00, src00, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(hstate, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GRUCELL_ACTIVATION_SIGMOID_TANH_BF16(sigmoid)\n\ +GRUCELL_ACTIVATION_SIGMOID_TANH_BF16(hsigmoid)\n\ "; /* end of grucell_activation_vx*/ static const char grucell_activation_sma_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -12957,6 +13720,11 @@ static const char grucell_activation_sma_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ _viv_uniform VXC_512Bits uniA_Minus_B_2x8;\n\ _viv_uniform VXC_512Bits uniA_Times_B_2x8;\n\ _viv_uniform VXC_512Bits uniA_Plus_B_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ __kernel void grucell_activation_sma_F16_F16_F16toF16\n\ (\n\ __read_only image2d_array_t input0,\n\ @@ -13015,6 +13783,104 @@ __kernel void grucell_activation_sma_F16_F16_F16toF16_2D\n\ VXC_WriteImage(h_status, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ +__kernel void grucell_activation_sma_BF16_BF16_BF16toBF16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __read_only image2d_array_t input2,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t h_status\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + float4 src0, src00, src1, src11, src2, src22, minus, minus1, dst, dst1;\n\ + vxc_ushort8 vec0, vec1, vec2, data0, data1;\n\ +\n\ + VXC_ReadImage2DArray(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(vec2, input2, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + VXC_DP2x8(data0, vec0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(data1, vec0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src0, data0, 16);\n\ + _viv_asm(COPY, src00, data1, 16);\n\ + VXC_DP2x8(data0, vec1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(data1, vec1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 
0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src1, data0, 16);\n\ + _viv_asm(COPY, src11, data1, 16);\n\ + VXC_DP2x8(data0, vec2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(data1, vec2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src2, data0, 16);\n\ + _viv_asm(COPY, src22, data1, 16);\n\ +\n\ + minus = src0 - src1;\n\ + minus1 = src00 - src11;\n\ +\n\ + dst = minus * src2 + src1;\n\ + dst1 = minus1 * src22 + src11;\n\ + _viv_asm(COPY, vec0, dst, 16);\n\ + _viv_asm(COPY, vec1, dst1, 16);\n\ + VXC_DP2x8(data0, vec0, vec1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, data0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(h_status, coord, data0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void grucell_activation_sma_BF16_BF16_BF16toBF16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __read_only image2d_array_t input2,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t h_status\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + float4 src0, src00, src1, src11, src2, src22, minus, minus1, dst, dst1;\n\ + vxc_ushort8 vec0, vec1, vec2, data0, data1;\n\ +\n\ + VXC_ReadImage(vec0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(vec1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(vec2, input2, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + VXC_DP2x8(data0, vec0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(data1, vec0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src0, data0, 16);\n\ + _viv_asm(COPY, src00, data1, 16);\n\ + VXC_DP2x8(data0, vec1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(data1, vec1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src1, data0, 16);\n\ + _viv_asm(COPY, src11, data1, 16);\n\ + VXC_DP2x8(data0, vec2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(data1, vec2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src2, data0, 16);\n\ + _viv_asm(COPY, src22, data1, 16);\n\ +\n\ + minus = src0 - src1;\n\ + minus1 = src00 - src11;\n\ +\n\ + dst = minus * src2 + src1;\n\ + dst1 = minus1 * src22 + src11;\n\ + _viv_asm(COPY, vec0, dst, 16);\n\ + _viv_asm(COPY, vec1, dst1, 16);\n\ + VXC_DP2x8(data0, vec0, vec1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord, data0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(h_status, coord, data0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ "; /* end of grucell_activation_sma_vx*/ static const char grucell_activation_z_h_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -13022,6 +13888,9 @@ static const char grucell_activation_z_h_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ #define logE (1.44269502f)\n\ #define twoLogE (logE * 2.0f)\n\ \n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ 
+\n\ float4 sigmoid_func(float4 x)\n\ {\n\ x *= -logE;\n\ @@ -13147,6 +14016,55 @@ GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8)\ GRUCELL_QNT_F16TO_QNT(U8, U8, HSIGMOID, hard_sigmoid, vxc_uchar8, vxc_uchar8)\n\ GRUCELL_QNT_F16TO_QNT(I8, I8, HSIGMOID, hard_sigmoid, vxc_char8, vxc_char8)\n\ GRUCELL_QNT_F16TO_QNT(I16, I16, HSIGMOID, hard_sigmoid, vxc_short8, vxc_short8)\n\ +\n\ +#define GRUCELL_BF16(act_name, act_func) \\\n\ +__kernel void grucell_activation_z_h_BF16_BF16toBF16_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + vxc_short8 v0, v1, v2, v3, v4, v5, v6, data0, data1; \\\n\ + float4 src0, src1, src2, src3, src4, src5, src6; \\\n\ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(data0, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src2, data0, 16); \\\n\ + VXC_DP2x8(data1, v4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src4, data1, 16); \\\n\ + VXC_DP2x8(data0, v5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src5, data0, 16); \\\n\ + VXC_DP2x8(data1, v6, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src6, data1, 16); \\\n\ + VXC_DP2x8(data0, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src3, data0, 16); \\\n\ + \\\n\ + float4 h = src2 + src4; \\\n\ + float4 z = src5 + src6; \\\n\ + h = tanh_func(h); \\\n\ + z = act_func(z); \\\n\ + float4 result = (1 - z) * h + z * src3; \\\n\ + _viv_asm(COPY, v0, result, 16); \\\n\ + VXC_DP2x8(data0, v0, v0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(hstate_out, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GRUCELL_BF16(SIGMOID, sigmoid_func)\n\ +GRUCELL_BF16(HSIGMOID, hard_sigmoid)\n\ "; /* end of grucell_activation_z_h_vx*/ static const char grucell_cdnn_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -13541,6 +14459,352 @@ __kernel void grucell_activation_cdnn_F16_F16_F16_to_F16\n\ \n\ "; /* end of grucell_cdnn_activation_vx*/ +static const char grucell_cdnn_activation_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define logE (1.44269502f)\n\ +#define twoLogE (2.88539004f)\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= 
-logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +__kernel void grucell_activation_cdnn_sep_BF16_BF16_BF16_to_BF16_NC(\n\ + __read_only image2d_array_t prev_state,\n\ + __read_only image2d_array_t input_r,\n\ + __read_only image2d_array_t input_z,\n\ + __read_only image2d_array_t input_c,\n\ + __read_only image2d_array_t recur_r,\n\ + __read_only image2d_array_t recur_z,\n\ + __read_only image2d_array_t recur_c,\n\ + __read_only image2d_t bias_r,\n\ + __read_only image2d_t bias_z,\n\ + __read_only image2d_t bias_c,\n\ + __read_only image2d_t cond_r,\n\ + __read_only image2d_t cond_z,\n\ + __read_only image2d_t cond_c,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t hstate,\n\ + int gate_activation, int candidate_activation, int batch_first)\n\ +{\n\ + vxc_ushort8 s0, s1, s2, s3, s4, s5, s7, data0, data1;\n\ + float4 r0, r1, z0, z1, c0, c1, state;\n\ + float4 r, r2, r3, z, z2, z3, c, c2, c3;\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s7, prev_state, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + r2 = read_imagef(bias_r, coord);\n\ + r3 = read_imagef(cond_r, coord);\n\ + z2 = read_imagef(bias_z, coord);\n\ + z3 = read_imagef(cond_z, coord);\n\ + c2 = read_imagef(bias_c, coord);\n\ + c3 = read_imagef(cond_c, coord);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + VXC_DP2x8(data0, s0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, r0, data0, 16);\n\ + VXC_DP2x8(data1, s1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, r1, data1, 16);\n\ + VXC_DP2x8(data0, s2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, z0, data0, 16);\n\ + VXC_DP2x8(data1, s3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, z1, data1, 16);\n\ + VXC_DP2x8(data0, s4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, c0, data0, 16);\n\ + VXC_DP2x8(data1, s5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, c1, data1, 16);\n\ + VXC_DP2x8(data0, s7, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, state, data0, 16);\n\ + r = r0 + r1 + r2 + r3;\n\ + z = z0 + z1 + z2 + z3;\n\ +\n\ + r = sigmoid(r);\n\ + z = sigmoid(z);\n\ +\n\ + c = c2 * r + c3;\n\ + c = c0 + c1 * r + c;\n\ + c = tangentH(c);\n\ +\n\ + state = z * (state - c) + c;\n\ + _viv_asm(COPY, s0, state, 16);\n\ + VXC_DP2x8(data0, s0, s0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + 
VXC_WriteImage(output, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void grucell_activation_cdnn_sep_BF16_BF16_BF16_to_BF16_CN(\n\ + __read_only image2d_array_t prev_state,\n\ + __read_only image2d_array_t input_r,\n\ + __read_only image2d_array_t input_z,\n\ + __read_only image2d_array_t input_c,\n\ + __read_only image2d_array_t recur_r,\n\ + __read_only image2d_array_t recur_z,\n\ + __read_only image2d_array_t recur_c,\n\ + __read_only image2d_t bias_r,\n\ + __read_only image2d_t bias_z,\n\ + __read_only image2d_t bias_c,\n\ + __read_only image2d_t cond_r,\n\ + __read_only image2d_t cond_z,\n\ + __read_only image2d_t cond_c,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t hstate,\n\ + int gate_activation, int candidate_activation, int batch_first)\n\ +{\n\ + vxc_ushort8 s0, s1, s2, s3, s4, s5, s7, data0, data1;\n\ + float4 r0, r1, z0, z1, c0, c1, state;\n\ + float4 r, r2, r3, z, z2, z3, c, c2, c3;\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s7, prev_state, coord.yx, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + r2 = read_imagef(bias_r, coord.yx);\n\ + r3 = read_imagef(cond_r, coord.yx);\n\ + z2 = read_imagef(bias_z, coord.yx);\n\ + z3 = read_imagef(cond_z, coord.yx);\n\ + c2 = read_imagef(bias_c, coord.yx);\n\ + c3 = read_imagef(cond_c, coord.yx);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + VXC_DP2x8(data0, s0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, r0, data0, 16);\n\ + VXC_DP2x8(data1, s1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, r1, data1, 16);\n\ + VXC_DP2x8(data0, s2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, z0, data0, 16);\n\ + VXC_DP2x8(data1, s3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, z1, data1, 16);\n\ + VXC_DP2x8(data0, s4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, c0, data0, 16);\n\ + VXC_DP2x8(data1, s5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, c1, data1, 16);\n\ + VXC_DP2x8(data0, s7, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, state, data0, 16);\n\ +\n\ + r = r0 + r1 + r2.xxxx + r3.xxxx;\n\ + z = z0 + z1 + z2.xxxx + z3.xxxx;\n\ 
+\n\ + r = sigmoid(r);\n\ + z = sigmoid(z);\n\ +\n\ + c = c2.xxxx * r + c3.xxxx;\n\ + c = c0 + c1 * r + c;\n\ + c = tangentH(c);\n\ + state = z * (state - c) + c;\n\ +\n\ + _viv_asm(COPY, s0, state, 16);\n\ + VXC_DP2x8(data0, s0, s0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + VXC_WriteImage(output, coord.yx, data0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord.yx, data0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord.x ++;\n\ + VXC_WriteImage(output, coord.yx, data0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord.yx, data0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord.x ++;\n\ + VXC_WriteImage(output, coord.yx, data0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord.yx, data0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord.x ++;\n\ + VXC_WriteImage(output, coord.yx, data0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord.yx, data0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void grucell_activation_cdnn_sep_BF16_BF16_BF16_to_BF16_CN_FULL(\n\ + __read_only image2d_array_t prev_state,\n\ + __read_only image2d_array_t input_r,\n\ + __read_only image2d_array_t input_z,\n\ + __read_only image2d_array_t input_c,\n\ + __read_only image2d_array_t recur_r,\n\ + __read_only image2d_array_t recur_z,\n\ + __read_only image2d_array_t recur_c,\n\ + __read_only image2d_t bias_r,\n\ + __read_only image2d_t bias_z,\n\ + __read_only image2d_t bias_c,\n\ + __read_only image2d_t cond_r,\n\ + __read_only image2d_t cond_z,\n\ + __read_only image2d_t cond_c,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t hstate,\n\ + int gate_activation, int candidate_activation, int batch_first)\n\ +{\n\ + vxc_ushort8 s0, s1, s2, s3, s4, s5, s7, data0, data1;\n\ + float4 r0, r1, z0, z1, c0, c1, state;\n\ + float4 r, r2, r3, z, z2, z3, c, c2, c3;\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + VXC_ReadImage(s0, input_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s1, recur_r, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s2, input_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s3, recur_z, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s4, input_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s5, recur_c, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s7, prev_state, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + r2 = read_imagef(bias_r, coord.yx);\n\ + r3 = read_imagef(cond_r, coord.yx);\n\ + z2 = read_imagef(bias_z, coord.yx);\n\ + z3 = read_imagef(cond_z, coord.yx);\n\ + c2 = read_imagef(bias_c, coord.yx);\n\ + c3 = read_imagef(cond_c, coord.yx);\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + VXC_DP2x8(data0, s0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, r0, data0, 16);\n\ + VXC_DP2x8(data1, s1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, r1, data1, 16);\n\ + VXC_DP2x8(data0, s2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, z0, data0, 16);\n\ + VXC_DP2x8(data1, s3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + 
uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, z1, data1, 16);\n\ + VXC_DP2x8(data0, s4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, c0, data0, 16);\n\ + VXC_DP2x8(data1, s5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, c1, data1, 16);\n\ + VXC_DP2x8(data0, s7, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, state, data0, 16);\n\ +\n\ + r = r0 + r1 + r2.xxxx + r3.xxxx;\n\ + z = z0 + z1 + z2.xxxx + z3.xxxx;\n\ +\n\ + r = sigmoid(r);\n\ + z = sigmoid(z);\n\ +\n\ + c = c2.xxxx * r + c3.xxxx;\n\ + c = c0 + c1 * r + c;\n\ + c = tangentH(c);\n\ + state = z * (state - c) + c;\n\ +\n\ + _viv_asm(COPY, s0, state, 16);\n\ + VXC_DP2x8(data0, s0, s0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +\n\ +__kernel void grucell_activation_cdnn_BF16_BF16_BF16_to_BF16(\n\ + __read_only image2d_array_t prev_state,\n\ + __read_only image2d_array_t input_rzc,\n\ + __read_only image2d_array_t recur_rzc,\n\ + __read_only image2d_t bias_r,\n\ + __read_only image2d_t bias_z,\n\ + __read_only image2d_t bias_c,\n\ + __read_only image2d_t cond_r,\n\ + __read_only image2d_t cond_z,\n\ + __read_only image2d_t cond_c,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t hstate,\n\ + int gate_activation, int candidate_activation, int batch_first)\n\ +{\n\ + vxc_ushort8 s0, s1, s2, s3, s4, s5, s7, data0, data1;\n\ + float4 r0, r1, z0, z1, c0, c1, state;\n\ + float4 r, r2, r3, z, z2, z3, c, c2, c3;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1) * 3, get_global_id(1));\n\ +\n\ + VXC_ReadImage(s0, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s1, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s2, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s3, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s4, input_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s5, recur_rzc, coord.xz, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(s7, prev_state, coord.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + r2 = read_imagef(bias_r, coord.xy);\n\ + r3 = read_imagef(cond_r, coord.xy);\n\ + z2 = read_imagef(bias_z, coord.xy);\n\ + z3 = read_imagef(cond_z, coord.xy);\n\ + c2 = read_imagef(bias_c, coord.xy);\n\ + c3 = read_imagef(cond_c, coord.xy);\n\ +\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + VXC_DP2x8(data0, s0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, r0, data0, 16);\n\ + VXC_DP2x8(data1, s1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, r1, data1, 16);\n\ + VXC_DP2x8(data0, s2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, z0, data0, 16);\n\ + VXC_DP2x8(data1, s3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + 
uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, z1, data1, 16);\n\ + VXC_DP2x8(data0, s4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, c0, data0, 16);\n\ + VXC_DP2x8(data1, s5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, c1, data1, 16);\n\ + VXC_DP2x8(data0, s7, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, state, data0, 16);\n\ +\n\ + r = r0 + r1 + r2 + r3;\n\ + z = z0 + z1 + z2 + z3;\n\ +\n\ + r = sigmoid(r);\n\ + z = sigmoid(z);\n\ +\n\ + c = c2 * r + c3;\n\ + c = c0 + c1 * r + c;\n\ + c = tangentH(c);\n\ + state = z * (state - c) + c;\n\ +\n\ + _viv_asm(COPY, s0, state, 16);\n\ + VXC_DP2x8(data0, s0, s0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord.xy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(hstate, coord.xy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of grucell_cdnn_activation_bf16_vx*/ + static const char grucell_cdnn_activation_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ #define logE (1.44269502f)\n\ @@ -13944,6 +15208,9 @@ static const char grucell_h_times_activation_r_vx[] = "#include \"cl_viv_vx_ext. #define logE (1.44269502f)\n\ #define twoLogE (logE * 2.0f)\n\ \n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ float4 sigmoid_func(float4 x)\n\ {\n\ x *= -logE;\n\ @@ -14039,6 +15306,42 @@ GRUCELL_QNT_F16TO_F16(I16, SIGMOID, sigmoid_func, vxc_short8)\n\ GRUCELL_QNT_F16TO_F16(U8, HSIGMOID, hard_sigmoid, vxc_uchar8)\n\ GRUCELL_QNT_F16TO_F16(I8, HSIGMOID, hard_sigmoid, vxc_char8)\n\ GRUCELL_QNT_F16TO_F16(I16, HSIGMOID, hard_sigmoid, vxc_short8)\n\ +\n\ +#define GRUCELL_BF16(act_name, act_func) \\\n\ +__kernel void grucell_h_times_activation_r_BF16_BF16toBF16_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __write_only image2d_t output \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + vxc_short8 v0, v1, v2, v3, data0, data1; \\\n\ + float4 src0, src1, src2, src3; \\\n\ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(data0, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src0, data0, 16); \\\n\ + VXC_DP2x8(data1, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src1, data1, 16); \\\n\ + VXC_DP2x8(data0, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src3, data0, 16); \\\n\ + \\\n\ + float4 r; \\\n\ + r = src0 + src1; \\\n\ + r = act_func(r); \\\n\ + float4 result = r * src3; \\\n\ + _viv_asm(COPY, v0, result, 16); \\\n\ + VXC_DP2x8(data0, v0, v0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GRUCELL_BF16(SIGMOID, sigmoid_func)\n\ 
+GRUCELL_BF16(HSIGMOID, hard_sigmoid)\n\ "; /* end of grucell_h_times_activation_r_vx*/ static const char grucell_reset_after_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -14046,6 +15349,9 @@ static const char grucell_reset_after_activation_vx[] = "#include \"cl_viv_vx_ex #define logE (1.44269502f)\n\ #define twoLogE (logE * 2.0f)\n\ \n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ float4 sigmoid_func(float4 x)\n\ {\n\ x *= -logE;\n\ @@ -14193,6 +15499,68 @@ GRUCELL_QNT_F16TO_QNT(I16_F16toI16_TANH_SIGMOID, tanh_func, sigmoid_func, GRUCELL_QNT_F16TO_QNT(U8_F16toU8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ GRUCELL_QNT_F16TO_QNT(I8_F16toI8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_char8, vxc_char8)\n\ GRUCELL_QNT_F16TO_QNT(I16_F16toI16_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_short8, vxc_short8)\n\ +\n\ +#define GRUCELL_BF16(act_name, act_func, rec_act_name, rec_act_func) \\\n\ +__kernel void grucell_reset_after_activation_BF16_BF16toBF16_##act_name##_##rec_act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + vxc_short8 v0, v1, v2, v3, v4, v5, v6, data0, data1; \\\n\ + float4 src0, src1, src2, src3, src4, src5, src6; \\\n\ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(data0, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src0, data0, 16); \\\n\ + VXC_DP2x8(data1, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src1, data1, 16); \\\n\ + VXC_DP2x8(data0, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src2, data0, 16); \\\n\ + VXC_DP2x8(data1, v4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src4, data1, 16); \\\n\ + VXC_DP2x8(data0, v5, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src5, data0, 16); \\\n\ + VXC_DP2x8(data1, v6, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src6, data1, 16); \\\n\ + VXC_DP2x8(data0, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src3, data0, 
16); \\\n\ + \\\n\ + float4 r; \\\n\ + r = src0 + src1; \\\n\ + r = rec_act_func(r); \\\n\ + float4 h = src4 + r * src2; \\\n\ + float4 z = src5 + src6; \\\n\ + h = act_func(h); \\\n\ + z = rec_act_func(z); \\\n\ + float4 result = (1 - z) * h + z * src3; \\\n\ + _viv_asm(COPY, v0, result, 16); \\\n\ + VXC_DP2x8(data0, v0, v0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(hstate_out, coord_in, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GRUCELL_BF16(TANH, tanh_func, SIGMOID, sigmoid_func)\n\ +GRUCELL_BF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func)\n\ "; /* end of grucell_reset_after_activation_vx*/ static const char hswish_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -19040,6 +20408,132 @@ TENSORLOGICAL_BFP16_2D(and, BF16, I8, vxc_short8, vxc_short8, vxc_short8, TENSORLOGICAL_BFP16_2D(xor, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)\n\ "; /* end of logical_ops_vx*/ +static const char lstmunit_activation_BP_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +\n\ +#define LSTMUNIT_BP_BF16(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_BP_BF16toBF16_BF16_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \\\n\ + float4 src0, src1, src2, src3, src4; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src0, data0, 16); \\\n\ + VXC_DP2x8(data1, vect10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src10, data1, 16); \\\n\ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src1, data0, 16); \\\n\ + VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src11, data1, 16); \\\n\ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src2, data0, 16); \\\n\ + VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src12, data1, 16); \\\n\ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src3, data0, 16); \\\n\ + VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src13, data1, 16); \\\n\ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_c_t, data0, 16); \\\n\ + data_i_t = src0 + src10 + b0; \\\n\ + data_f_t = src1 + src11 + b1; \\\n\ + data_g_t = src2 + src12 + b2; \\\n\ + data_o_t = src3 + src13 + b3; \\\n\ + \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + _viv_asm(COPY, vect0, data_c_t, 16); \\\n\ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(COPY, vect1, data_o_t, 16); \\\n\ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_BP_BF16(SIGMOID, sigmoid)\n\ +LSTMUNIT_BP_BF16(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_BP_BF16_vx*/ + static const char lstmunit_activation_BP_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float logE;\n\ @@ -19460,6 +20954,134 @@ LSTMUNIT_BP_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmo LSTMUNIT_BP_U8_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ "; /* end of lstmunit_activation_BP_U8_vx*/ +static const char lstmunit_activation_B_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +\n\ +#define LSTMUNIT_B_BF16(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_B_BF16toBF16_BF16_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \\\n\ + float4 src0, src1, src2, src3, src4; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 
0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src0, data0, 16); \\\n\ + VXC_DP2x8(data1, vect10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src10, data1, 16); \\\n\ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src1, data0, 16); \\\n\ + VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src11, data1, 16); \\\n\ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src2, data0, 16); \\\n\ + VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src12, data1, 16); \\\n\ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src3, data0, 16); \\\n\ + VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src13, data1, 16); \\\n\ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_c_t, data0, 16); \\\n\ + data_i_t = src0 + src10 + b0; \\\n\ + data_f_t = src1 + src11 + b1; \\\n\ + data_g_t = src2 + src12 + b2; \\\n\ + data_o_t = src3 + src13 + b3; \\\n\ + \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + _viv_asm(COPY, vect0, data_c_t, 16); \\\n\ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(COPY, vect1, data_o_t, 16); \\\n\ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_B_BF16(SIGMOID, sigmoid)\n\ +LSTMUNIT_B_BF16(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_B_BF16_vx*/ + static const char lstmunit_activation_B_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float logE;\n\ @@ -19881,6 +21503,119 @@ LSTMUNIT_B_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoi LSTMUNIT_B_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ "; /* end of lstmunit_activation_B_U8_vx*/ +static const char lstmunit_activation_CBP_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +\n\ +#define LSTMUNIT_CBP_BF16(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CBP_BF16toBF16_BF16_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \\\n\ + float4 src0, src1, src2, src3, src4; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, 
coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src1, data0, 16); \\\n\ + VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src11, data1, 16); \\\n\ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src2, data0, 16); \\\n\ + VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src12, data1, 16); \\\n\ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src3, data0, 16); \\\n\ + VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src13, data1, 16); \\\n\ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_c_t, data0, 16); \\\n\ + data_f_t = src1 + src11 + b1; \\\n\ + data_g_t = src2 + src12 + b2; \\\n\ + data_o_t = src3 + src13 + b3; \\\n\ + \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + _viv_asm(COPY, vect0, data_c_t, 16); \\\n\ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(COPY, vect1, data_o_t, 16); \\\n\ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CBP_BF16(SIGMOID, sigmoid)\n\ +LSTMUNIT_CBP_BF16(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CBP_BF16_vx*/ + static const char lstmunit_activation_CBP_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float logE;\n\ @@ -20262,6 +21997,121 @@ LSTMUNIT_CBP_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigm LSTMUNIT_CBP_U8_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ "; /* end of lstmunit_activation_CBP_U8_vx*/ +static const char lstmunit_activation_CB_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +\n\ +#define LSTMUNIT_CB_BF16(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CB_BF16toBF16_BF16_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \\\n\ + float4 src0, src1, src2, src3, src4; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 
0)); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src1, data0, 16); \\\n\ + VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src11, data1, 16); \\\n\ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src2, data0, 16); \\\n\ + VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src12, data1, 16); \\\n\ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src3, data0, 16); \\\n\ + VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src13, data1, 16); \\\n\ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_c_t, data0, 16); \\\n\ + data_f_t = src1 + src11 + b1; \\\n\ + data_g_t = src2 + src12 + b2; \\\n\ + data_o_t = src3 + src13 + b3; \\\n\ + \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + _viv_asm(COPY, vect0, data_c_t, 16); \\\n\ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(COPY, vect1, data_o_t, 16); \\\n\ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CB_BF16(SIGMOID, sigmoid)\n\ +LSTMUNIT_CB_BF16(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CB_BF16_vx*/ + static const char lstmunit_activation_CB_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float logE;\n\ @@ -20644,6 +22494,109 @@ LSTMUNIT_CB_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmo LSTMUNIT_CB_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ "; /* end of lstmunit_activation_CB_U8_vx*/ +static const char lstmunit_activation_CLP_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +#define LSTMUNIT_CLP_BF16(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CLP_BF16toBF16_BF16_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 w0, w1, w2, b0, b1, b2; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + w0 = read_imagef(layer_norm_wf, 
coord_in.xw); \\\n\ + w1 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + b0 = read_imagef(bias_f, coord_in.xw); \\\n\ + b1 = read_imagef(bias_c, coord_in.xw); \\\n\ + b2 = read_imagef(bias_o, coord_in.xw); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_f_t, data0, 16); \\\n\ + VXC_DP2x8(data1, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_g_t, data1, 16); \\\n\ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_o_t, data0, 16); \\\n\ + VXC_DP2x8(data1, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_c_t, data1, 16); \\\n\ + \\\n\ + data_f_t = data_f_t * w0 + b0; \\\n\ + data_g_t = data_g_t * w1 + b1; \\\n\ + data_o_t = data_o_t * w2 + b2; \\\n\ + \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? clip_Min_F : data_c_t; \\\n\ + _viv_asm(COPY, vect0, data_c_t, 16); \\\n\ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(COPY, vect1, data_o_t, 16); \\\n\ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CLP_BF16(SIGMOID, sigmoid)\n\ +LSTMUNIT_CLP_BF16(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CLP_BF16_vx*/ + static const char lstmunit_activation_CLP_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float logE;\n\ @@ -20825,6 +22778,110 @@ LSTMUNIT_CLP_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid LSTMUNIT_CLP_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ "; /* end of lstmunit_activation_CLP_F16_vx*/ +static const char lstmunit_activation_CL_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +\n\ +#define LSTMUNIT_CL_BF16(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CL_BF16toBF16_BF16_##act_name( \\\n\ + __read_only 
image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 w0, w1, w2, b0, b1, b2; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + w0 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w1 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + b0 = read_imagef(bias_f, coord_in.xw); \\\n\ + b1 = read_imagef(bias_c, coord_in.xw); \\\n\ + b2 = read_imagef(bias_o, coord_in.xw); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_f_t, data0, 16); \\\n\ + VXC_DP2x8(data1, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_g_t, data1, 16); \\\n\ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_o_t, data0, 16); \\\n\ + VXC_DP2x8(data1, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_c_t, data1, 16); \\\n\ + \\\n\ + data_f_t = data_f_t * w0 + b0; \\\n\ + data_g_t = data_g_t * w1 + b1; \\\n\ + data_o_t = data_o_t * w2 + b2; \\\n\ + \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + _viv_asm(COPY, vect0, data_c_t, 16); \\\n\ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(COPY, vect1, data_o_t, 16); \\\n\ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CL_BF16(SIGMOID, sigmoid)\n\ +LSTMUNIT_CL_BF16(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CL_BF16_vx*/ + static const char lstmunit_activation_CL_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float logE;\n\ @@ -21010,6 +23067,112 @@ LSTMUNIT_CL_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) LSTMUNIT_CL_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ "; /* end of lstmunit_activation_CL_F16_vx*/ +static const char lstmunit_activation_CSP_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +\n\ +#define LSTMUNIT_CSP_BF16(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CSP_BF16toBF16_BF16_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \\\n\ + float4 src0, src1, src2, src3, src4; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect3, input_o_conv, 
coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src1, data0, 16); \\\n\ + VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src11, data1, 16); \\\n\ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src2, data0, 16); \\\n\ + VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src12, data1, 16); \\\n\ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src3, data0, 16); \\\n\ + VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src13, data1, 16); \\\n\ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_c_t, data0, 16); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + _viv_asm(COPY, vect0, data_c_t, 16); \\\n\ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(COPY, vect1, data_o_t, 16); \\\n\ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CSP_BF16(SIGMOID, sigmoid)\n\ +LSTMUNIT_CSP_BF16(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CSP_BF16_vx*/ + static const char lstmunit_activation_CSP_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float logE;\n\ @@ -21357,6 +23520,114 @@ LSTMUNIT_CSP_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigm LSTMUNIT_CSP_U8_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ "; /* end of lstmunit_activation_CSP_U8_vx*/ +static const char lstmunit_activation_CS_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +\n\ +#define LSTMUNIT_CS_BF16(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_CS_BF16toBF16_BF16_##act_name( \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \\\n\ + float4 src0, src1, src2, src3, src4; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect13, 
hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src1, data0, 16); \\\n\ + VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src11, data1, 16); \\\n\ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src2, data0, 16); \\\n\ + VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src12, data1, 16); \\\n\ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src3, data0, 16); \\\n\ + VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src13, data1, 16); \\\n\ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_c_t, data0, 16); \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = 1.0 - data_f_t; \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + _viv_asm(COPY, vect0, data_c_t, 16); \\\n\ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(COPY, vect1, data_o_t, 16); \\\n\ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_CS_BF16(SIGMOID, sigmoid)\n\ +LSTMUNIT_CS_BF16(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_CS_BF16_vx*/ + static const char lstmunit_activation_CS_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float logE;\n\ @@ -21705,6 +23976,118 @@ LSTMUNIT_CS_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmo LSTMUNIT_CS_U8_FP16(F16, HARD_SIGMOID, half4, vxc_half4, vxc_short4, hard_sigmoid)\n\ "; /* end of lstmunit_activation_CS_U8_vx*/ +static const char lstmunit_activation_LP_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +\n\ +#define LSTMUNIT_LP_BF16(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_LP_BF16toBF16_BF16_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wi, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 w0, w1, w2, w3, b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + w0 = read_imagef(layer_norm_wi, coord_in.xw); \\\n\ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_i_t, data0, 16); \\\n\ + VXC_DP2x8(data1, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_f_t, data1, 16); \\\n\ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_g_t, data0, 16); \\\n\ + VXC_DP2x8(data1, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_o_t, data1, 16); \\\n\ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_c_t, data0, 16); \\\n\ + \\\n\ + data_i_t = data_i_t * w0 + b0; \\\n\ + data_f_t = data_f_t * w1 + b1; \\\n\ + data_g_t = data_g_t * w2 + b2; \\\n\ + data_o_t = data_o_t * w3 + b3; \\\n\ + \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + _viv_asm(COPY, vect0, data_c_t, 16); \\\n\ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(COPY, vect1, data_o_t, 16); \\\n\ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_LP_BF16(SIGMOID, sigmoid)\n\ +LSTMUNIT_LP_BF16(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_LP_BF16_vx*/ + static const char lstmunit_activation_LP_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float logE;\n\ @@ -21904,6 +24287,120 @@ LSTMUNIT_LP_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmoid) LSTMUNIT_LP_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ "; /* end of lstmunit_activation_LP_F16_vx*/ +static const char lstmunit_activation_L_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +\n\ +#define LSTMUNIT_L_BF16(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_L_BF16toBF16_BF16_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_t bias_i, \\\n\ + __read_only image2d_t bias_f, \\\n\ + __read_only image2d_t bias_c, \\\n\ + __read_only image2d_t bias_o, \\\n\ + __read_only image2d_t layer_norm_wi, \\\n\ + __read_only image2d_t layer_norm_wf, \\\n\ + __read_only image2d_t layer_norm_wc, \\\n\ + __read_only image2d_t layer_norm_wo, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 w0, w1, w2, w3, b0, b1, b2, b3; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + w0 = read_imagef(layer_norm_wi, coord_in.xw); \\\n\ + w1 = read_imagef(layer_norm_wf, coord_in.xw); \\\n\ + w2 = read_imagef(layer_norm_wc, coord_in.xw); \\\n\ + w3 = read_imagef(layer_norm_wo, coord_in.xw); \\\n\ + b0 = read_imagef(bias_i, coord_in.xw); \\\n\ + b1 = read_imagef(bias_f, coord_in.xw); \\\n\ + b2 = read_imagef(bias_c, coord_in.xw); \\\n\ + b3 = read_imagef(bias_o, coord_in.xw); \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_i_t, data0, 16); \\\n\ + VXC_DP2x8(data1, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_f_t, data1, 16); \\\n\ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_g_t, data0, 16); \\\n\ + VXC_DP2x8(data1, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_o_t, data1, 16); \\\n\ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_c_t, data0, 16); \\\n\ + \\\n\ + data_i_t = data_i_t * w0 + b0; \\\n\ + data_f_t = data_f_t * w1 + b1; \\\n\ + data_g_t = data_g_t * w2 + b2; \\\n\ + data_o_t = data_o_t * w3 + b3; \\\n\ + \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + _viv_asm(COPY, vect0, data_c_t, 16); \\\n\ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(COPY, vect1, data_o_t, 16); \\\n\ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_L_BF16(SIGMOID, sigmoid)\n\ +LSTMUNIT_L_BF16(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_L_BF16_vx*/ + static const char lstmunit_activation_L_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float logE;\n\ @@ -22108,6 +24605,125 @@ LSTMUNIT_L_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\ \n\ "; /* end of lstmunit_activation_L_F16_vx*/ +static const char lstmunit_activation_SP_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +\n\ +#define LSTMUNIT_SP_BF16(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_SP_BF16toBF16_BF16_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1, data2, data3; \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + vxc_float4 src0, src1, src2, src3; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_float4 src10, src11, src12, src13; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src0, data0, 16); \\\n\ + VXC_DP2x8(data1, vect10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src10, data1, 16); \\\n\ + VXC_DP2x8(data2, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src1, data2, 16); \\\n\ + VXC_DP2x8(data3, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src11, data3, 16); \\\n\ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src2, data0, 16); \\\n\ + VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src12, data1, 16); \\\n\ + VXC_DP2x8(data2, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src3, data2, 16); \\\n\ + VXC_DP2x8(data3, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src13, data3, 16); \\\n\ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_c_t, data0, 16); \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + _viv_asm(COPY, data0, data_c_t, 16); \\\n\ + VXC_DP2x8(data1, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(COPY, data0, data_o_t, 16); \\\n\ + VXC_DP2x8(data1, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_SP_BF16(SIGMOID, sigmoid)\n\ +LSTMUNIT_SP_BF16(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_SP_BF16_vx*/ + static const char lstmunit_activation_SP_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float logE;\n\ @@ -22484,6 +25100,126 @@ LSTMUNIT_SP_U8_FP16(U8, HARD_SIGMOID, int4, vxc_uchar4, vxc_uchar4, hard_sigmo LSTMUNIT_SP_U8_FP16(I16, HARD_SIGMOID, int4, vxc_short4, vxc_short4, hard_sigmoid)\n\ "; /* end of lstmunit_activation_SP_U8_vx*/ +static const char lstmunit_activation_S_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +_viv_uniform float logE;\n\ +_viv_uniform float twoLogE;\n\ +_viv_uniform float forget_bias;\n\ +float4 sigmoid(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tangentH(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform float4 clip_Min_F;\n\ +_viv_uniform float4 clip_Max_F;\n\ +\n\ +#define LSTMUNIT_S_BF16(act_name, act_func) \\\n\ +__kernel void lstmunit_activation_S_BF16toBF16_BF16_##act_name( \\\n\ + __read_only image2d_array_t input_i_conv, \\\n\ + __read_only image2d_array_t input_f_conv, \\\n\ + __read_only image2d_array_t input_c_conv, \\\n\ + __read_only image2d_array_t input_o_conv, \\\n\ + __read_only image2d_t cell_state_in, \\\n\ + __read_only image2d_array_t hstate_i_conv, \\\n\ + __read_only image2d_array_t hstate_f_conv, \\\n\ + __read_only image2d_array_t hstate_c_conv, \\\n\ + __read_only image2d_array_t hstate_o_conv, \\\n\ + __write_only image2d_array_t output, \\\n\ + __write_only image2d_t cell_state_out, \\\n\ + __write_only image2d_t h_state_out, \\\n\ + int _is_ln, int _is_cifg, int _is_hybrid, float cell_clip, float forgetBias \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + vxc_short8 vect0, vect1, vect2, vect3, vect4, data0, data1; \\\n\ + vxc_float4 src0, src1, src2, src3, src4; \\\n\ + vxc_short8 vect10, vect11, vect12, vect13; \\\n\ + vxc_float4 src10, src11, src12, src13; \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + VXC_ReadImage(vect0, input_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect10, hstate_i_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect1, input_f_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect11, hstate_f_conv, coord_in.xy, 0, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect2, input_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect12, hstate_c_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect3, input_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect13, hstate_o_conv, coord_in.xy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(vect4, cell_state_in, coord_in.zy, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(data0, vect0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src0, data0, 16); \\\n\ + VXC_DP2x8(data1, vect10, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src10, data1, 16); \\\n\ + VXC_DP2x8(data0, vect1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src1, data0, 16); \\\n\ + VXC_DP2x8(data1, vect11, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src11, data1, 16); \\\n\ + VXC_DP2x8(data0, vect2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src2, data0, 16); \\\n\ + VXC_DP2x8(data1, vect12, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src12, data1, 16); \\\n\ + VXC_DP2x8(data0, vect3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src3, data0, 16); \\\n\ + VXC_DP2x8(data1, vect13, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, src13, data1, 16); \\\n\ + VXC_DP2x8(data0, vect4, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data_c_t, data0, 16); \\\n\ + \\\n\ + data_i_t = src0 + src10; \\\n\ + data_f_t = src1 + src11; \\\n\ + data_g_t = src2 + src12; \\\n\ + data_o_t = src3 + src13; \\\n\ + \\\n\ + data_i_t = act_func(data_i_t); \\\n\ + data_f_t = act_func(data_f_t + forget_bias); \\\n\ + data_g_t = tangentH(data_g_t); \\\n\ + data_i_t = data_i_t * data_g_t; \\\n\ + data_c_t = data_c_t * data_f_t + data_i_t; \\\n\ + data_o_t = act_func(data_o_t); \\\n\ + data_c_t = data_c_t > clip_Max_F ? clip_Max_F : data_c_t; \\\n\ + data_c_t = data_c_t < clip_Min_F ? 
clip_Min_F : data_c_t; \\\n\ + _viv_asm(COPY, vect0, data_c_t, 16); \\\n\ + VXC_DP2x8(data0, vect0, vect0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(cell_state_out, coord_in.zy, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + data_c_t = tangentH(data_c_t); \\\n\ + data_o_t = data_o_t * data_c_t * outputScale + outputZP; \\\n\ + _viv_asm(COPY, vect1, data_o_t, 16); \\\n\ + VXC_DP2x8(data1, vect1, vect1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(h_state_out, coord_in.zy, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +LSTMUNIT_S_BF16(SIGMOID, sigmoid)\n\ +LSTMUNIT_S_BF16(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of lstmunit_activation_S_BF16_vx*/ + static const char lstmunit_activation_S_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float logE;\n\ @@ -27359,6 +30095,291 @@ MAXIMUM_QUANTTOF16_2D_IMPL(U8U8toF16, vxc_uchar16)\n\ MAXIMUM_QUANTTOF16_2D_IMPL(I8I8toF16, vxc_char16)\n\ MAXIMUM_QUANTTOF16_2D_IMPL(I16I16toF16, vxc_short8)"; /* end of maximum_1_vx*/ +static const char maxpool_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvF16toFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +_viv_uniform float inout_scale;\n\ +_viv_uniform float inout_tail;\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +\n\ +#define MAXPOOL_QINT(in_name, out_name, src_type, dst_type, max_val) \\\n\ +__kernel void maxpool_##in_name##to##out_name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int stride_x, int stride_y, int pad_x, int pad_y, \\\n\ + int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0); \\\n\ + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); \\\n\ + int4 coord_in = coord_out; \\\n\ + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); \\\n\ + for(; pos_start.x < 0;) \\\n\ + { \\\n\ + pos_start.x += dilation_x; \\\n\ + } \\\n\ + for(; pos_start.y < 0;) \\\n\ + { \\\n\ + pos_start.y += dilation_y; \\\n\ + } \\\n\ + pos_end = min(pos_end, (int2)(width, height)); \\\n\ + \\\n\ + src_type src0; \\\n\ + dst_type maxVal; \\\n\ + maxVal.x = max_val; \\\n\ + \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr_a); \\\n\ + \\\n\ + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) \\\n\ + { \\\n\ + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x += dilation_x; \\\n\ + VXC_VertMax3_Integer(maxVal, src0, src0, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + float4 fValTmp; \\\n\ + fValTmp.x = maxVal.x * inout_scale + inout_tail; \\\n\ + int4 i4Val = convert_int4_rte(fValTmp); \\\n\ + VXC_DP2x8(maxVal, i4Val, i4Val, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 
1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +MAXPOOL_QINT(U8, U8, vxc_uchar8, vxc_uchar8, 0)\n\ +MAXPOOL_QINT(I8, I8, vxc_char8, vxc_char8, -128)\n\ +MAXPOOL_QINT(I16, I16, vxc_short8, vxc_short8, -32768)\n\ +\n\ +__kernel void maxpool_F16toF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int stride_x, int stride_y, int pad_x, int pad_y,\n\ + int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0);\n\ + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y);\n\ + int4 coord_in = coord_out;\n\ + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y);\n\ + for(; pos_start.x < 0;)\n\ + {\n\ + pos_start.x += dilation_x;\n\ + }\n\ + for(; pos_start.y < 0;)\n\ + {\n\ + pos_start.y += dilation_y;\n\ + }\n\ + pos_end = min(pos_end, (int2)(width, height));\n\ +\n\ + vxc_short8 data0;\n\ + vxc_half8 maxVal, src0;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr_a);\n\ + coord_in.xy = pos_start;\n\ +\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, maxVal, data0, 16);\n\ +\n\ + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y)\n\ + {\n\ + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;)\n\ + {\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x += dilation_x;\n\ + _viv_asm(COPY, src0, data0, 16);\n\ + VXC_VertMax3_Half(maxVal, src0, src0, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + _viv_asm(COPY, data0, maxVal, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, data0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +#define MAXPOOL_F16_TO_QINT(out_name, dst_type) \\\n\ +__kernel void maxpool_F16to##out_name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int stride_x, int stride_y, int pad_x, int pad_y, \\\n\ + int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0); \\\n\ + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); \\\n\ + int4 coord_in = coord_out; \\\n\ + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); \\\n\ + for(; pos_start.x < 0;) \\\n\ + { \\\n\ + pos_start.x += dilation_x; \\\n\ + } \\\n\ + for(; pos_start.y < 0;) \\\n\ + { \\\n\ + pos_start.y += dilation_y; \\\n\ + } \\\n\ + pos_end = min(pos_end, (int2)(width, height)); \\\n\ + \\\n\ + vxc_short8 data0; \\\n\ + vxc_half8 maxVal, src0; \\\n\ + \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr_a); \\\n\ + coord_in.xy = pos_start; \\\n\ + \\\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, maxVal, data0, 16); \\\n\ + 
\\\n\ + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) \\\n\ + { \\\n\ + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x += dilation_x; \\\n\ + _viv_asm(COPY, src0, data0, 16); \\\n\ + VXC_VertMax3_Half(maxVal, src0, src0, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + float4 fValTmp; \\\n\ + VXC_DP4x4(fValTmp, maxVal, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniConvF16toFp32_4x4); \\\n\ + fValTmp.x = fValTmp.x * inout_scale + inout_tail; \\\n\ + int4 i4Val = convert_int4_rte(fValTmp); \\\n\ + dst_type dst; \\\n\ + VXC_DP2x8(dst, i4Val, i4Val, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +\n\ +MAXPOOL_F16_TO_QINT(U8, vxc_uchar8)\n\ +MAXPOOL_F16_TO_QINT(I8, vxc_char8)\n\ +MAXPOOL_F16_TO_QINT(I16, vxc_short8)\n\ +\n\ +#define MAXPOOL_QINT_TO_F16(in_name, src_type, max_val) \\\n\ +__kernel void maxpool_##in_name##toF16( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int stride_x, int stride_y, int pad_x, int pad_y, \\\n\ + int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0); \\\n\ + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); \\\n\ + int4 coord_in = coord_out; \\\n\ + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); \\\n\ + for(; pos_start.x < 0;) \\\n\ + { \\\n\ + pos_start.x += dilation_x; \\\n\ + } \\\n\ + for(; pos_start.y < 0;) \\\n\ + { \\\n\ + pos_start.y += dilation_y; \\\n\ + } \\\n\ + pos_end = min(pos_end, (int2)(width, height)); \\\n\ + \\\n\ + src_type src0, maxVal; \\\n\ + maxVal.x = max_val; \\\n\ + \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr_a); \\\n\ + \\\n\ + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) \\\n\ + { \\\n\ + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x += dilation_x; \\\n\ + VXC_VertMax3_Integer(maxVal, src0, src0, maxVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + float4 fValTmp; \\\n\ + fValTmp.x = maxVal.x * inout_scale + inout_tail; \\\n\ + half4 h4Val; \\\n\ + _viv_asm(CONV, h4Val, fValTmp); \\\n\ + vxc_short8 dst; \\\n\ + _viv_asm(COPY, dst, h4Val, 4); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +MAXPOOL_QINT_TO_F16(U8, vxc_uchar8, 0)\n\ +MAXPOOL_QINT_TO_F16(I8, vxc_char8, -128)\n\ +MAXPOOL_QINT_TO_F16(I16, vxc_short8, -32768)\n\ +\n\ +__kernel void maxpool_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int stride_x, int stride_y, int pad_x, int pad_y,\n\ + int kernel_dia_x, int kernel_dia_y, int dilation_x, int dilation_y)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = 
get_global_id(2);\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0);\n\ + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y);\n\ + int4 coord_in = coord_out;\n\ + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y);\n\ + for(; pos_start.x < 0;)\n\ + {\n\ + pos_start.x += dilation_x;\n\ + }\n\ + for(; pos_start.y < 0;)\n\ + {\n\ + pos_start.y += dilation_y;\n\ + }\n\ + pos_end = min(pos_end, (int2)(width, height));\n\ +\n\ + vxc_short8 data0, val0;\n\ + float4 maxVal, src0;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr_a);\n\ + coord_in.xy = pos_start;\n\ +\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + VXC_DP2x8(val0, data0, zero, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, maxVal, val0, 4);\n\ +\n\ + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y)\n\ + {\n\ + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;)\n\ + {\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x += dilation_x;\n\ + VXC_DP2x8(val0, data0, zero, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src0, val0, 4);\n\ + maxVal = max(src0, maxVal);\n\ + }\n\ + }\n\ + _viv_asm(COPY, data0, maxVal, 16);\n\ + VXC_DP2x8(val0, data0, data0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, val0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of maxpool_vx*/ + static const char minimum_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void minimum_F16F16toF16\n\ @@ -32907,6 +35928,8 @@ _viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ \n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractYtoShortSub16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4;\n\ \n\ #define NV12_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ __kernel void pre_process_nv12_copy_##name \\\n\ @@ -32947,14 +35970,24 @@ __kernel void pre_process_nv12_copy_##name \\\n\ UV.s0123 = UV.s1032; \\\n\ } \\\n\ \\\n\ + vxc_short8 tmpY; \\\n\ vxc_char16 tmpUV; \\\n\ - short tmpVal = 128; \\\n\ + short tmpVal = 16; \\\n\ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractYtoShortSub16_2x8); \\\n\ + tmpVal = 128; \\\n\ VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \\\n\ \\\n\ float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \\\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \\\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \\\n\ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \\\n\ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 
0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \\\n\ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ \\\n\ conv_type result; \\\n\ dst_type dst0; \\\n\ @@ -33011,9 +36044,11 @@ _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ \n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertYtoShortSub16_2x8;\n\ \n\ _viv_uniform VXC_512Bits uniCalculateYShift_2x8;\n\ _viv_uniform VXC_512Bits uniCalculateUVShift_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4;\n\ \n\ #define NV12_OPT_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ __kernel void pre_process_nv12_scale_##name##_gq \\\n\ @@ -33074,14 +36109,24 @@ __kernel void pre_process_nv12_scale_##name##_gq \\\n\ VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ + vxc_short8 tmpY; \\\n\ vxc_char16 tmpUV; \\\n\ - short tmpVal = 128; \\\n\ + short tmpVal = 16; \\\n\ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertYtoShortSub16_2x8); \\\n\ + tmpVal = 128; \\\n\ VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \\\n\ \\\n\ float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \\\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \\\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \\\n\ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \\\n\ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \\\n\ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ \\\n\ conv_type result; \\\n\ dst_type dst0; \\\n\ @@ -33170,14 +36215,24 @@ __kernel void pre_process_nv12_scale_##name \\\n\ UV.s01234567 = UV.s10325476; \\\n\ } \\\n\ \\\n\ + vxc_short8 tmpY; \\\n\ vxc_char16 tmpUV; \\\n\ - short tmpVal = 128; \\\n\ + short tmpVal = 16; \\\n\ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertYtoShortSub16_2x8); \\\n\ + tmpVal = 128; \\\n\ VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \\\n\ \\\n\ float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \\\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 
3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \\\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \\\n\ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \\\n\ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \\\n\ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ \\\n\ conv_type result; \\\n\ dst_type dst0; \\\n\ @@ -35135,7 +38190,7 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - int4 coord_out = coord; \\\n\ + int4 coord_out = coord.wwzw; \\\n\ coord_out.xyw += rgb_order.xyz; \\\n\ float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \\\n\ rMean * r_scale * output_scale - output_zp, \\\n\ @@ -36807,6 +39862,7 @@ _viv_uniform VXC_512Bits uniConvertYUV422toR_4x4;\n\ \n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform VXC_512Bits uniExtractYUVtoShortSub_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4;\n\ \n\ #define YUV422_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ __kernel void pre_process_yuv422_copy_##name \\\n\ @@ -36845,11 +39901,21 @@ __kernel void pre_process_yuv422_copy_##name \\\n\ } \\\n\ \\\n\ float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \\\n\ vxc_short2 value = (vxc_short2)(128,16); \\\n\ VXC_DP2x8(tmpYUV, YUV, value, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractYUVtoShortSub_2x8); \\\n\ - VXC_DP4x4(tmpDstB, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \\\n\ - VXC_DP4x4(tmpDstG, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \\\n\ - VXC_DP4x4(tmpDstR, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \\\n\ + VXC_DP4x4(DstB_uchar, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertYUV422toB_4x4); \\\n\ + VXC_DP4x4(DstG_uchar, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertYUV422toG_4x4); \\\n\ + VXC_DP4x4(DstR_uchar, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertYUV422toR_4x4); \\\n\ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ \\\n\ conv_type result; \\\n\ dst_type dst0; \\\n\ @@ -36905,6 +39971,7 @@ _viv_uniform VXC_512Bits uniConvertYUV422toR_4x4;\n\ _viv_uniform VXC_512Bits 
uniExtract8Data_2x8;\n\ _viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;\n\ _viv_uniform VXC_512Bits uniExtractYtoShortSub16_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4;\n\ \n\ #define uyvy422 1\n\ \n\ @@ -36954,8 +40021,8 @@ __kernel void pre_process_yuv422_scale_##name \\\n\ } \\\n\ \\\n\ int4 coord_Y = (int4)(sx.x + y_offset, sy, 0, 0); \\\n\ - int4 coord_U = (int4)((sx.x >> 1) * 2 + u_offset, sy, 0, 0); \\\n\ - int4 coord_V = (int4)((sx.x >> 1) * 2 + v_offset, sy, 0, 0); \\\n\ + int4 coord_U = (int4)((sx.x >> 2) * 4 + u_offset, sy, 0, 0); \\\n\ + int4 coord_V = (int4)((sx.x >> 2) * 4 + v_offset, sy, 0, 0); \\\n\ \\\n\ VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_Y.x = sx.y + y_offset; \\\n\ @@ -36965,7 +40032,7 @@ __kernel void pre_process_yuv422_scale_##name \\\n\ coord_Y.x = sx.w + y_offset; \\\n\ VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - sx = (sx >> 1) * 2 + u_offset; \\\n\ + sx = (sx >> 2) * 4 + u_offset; \\\n\ VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_U.x = sx.y; \\\n\ VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -36989,9 +40056,19 @@ __kernel void pre_process_yuv422_scale_##name \\\n\ VXC_DP2x8(dst_test, dx, dx, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ \\\n\ float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ - VXC_DP4x4(tmpDstB, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \\\n\ - VXC_DP4x4(tmpDstG, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \\\n\ - VXC_DP4x4(tmpDstR, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \\\n\ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \\\n\ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertYUV422toB_4x4); \\\n\ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertYUV422toG_4x4); \\\n\ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\\\n\ + uniConvertYUV422toR_4x4); \\\n\ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ \\\n\ conv_type result; \\\n\ dst_type dst0; \\\n\ @@ -40023,8 +43100,8 @@ __kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8_2D( \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ src0_type src0; \\\n\ src0_copy_type srcA; \\\n\ - src0_type src1; \\\n\ - src0_copy_type srcB; \\\n\ + src1_type src1; \\\n\ + src1_copy_type srcB; \\\n\ VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, srcA, src0, 16); \\\n\ VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -40180,8 +43257,8 @@ __kernel void func_name##_##src0_type_name##src1_type_name##toBOOL8( \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ src0_type src0; \\\n\ src0_copy_type srcA; \\\n\ - src0_type src1; \\\n\ - src0_copy_type srcB; \\\n\ + 
src1_type src1; \\\n\ + src1_copy_type srcB; \\\n\ VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, srcA, src0, 16); \\\n\ VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -46275,37 +49352,40 @@ __kernel void scatter_nd_update_reset_##name0##to##name1( \\\n\ Image img1 = create_image_from_image2d(input_ref, size0); \\\n\ Image img2 = create_image_from_image2d(temp_ref, size1); \\\n\ Image img3 = create_image_from_image2d(temp_buf_int, 4); \\\n\ - __global ptr0* input_ptr = (__global ptr0*)img1.ptr; \\\n\ - __global ptr1* output_ptr = (__global ptr1*)img2.ptr; \\\n\ __global int* tmp_update_ptr = (__global int*)img3.ptr; \\\n\ - ptr0 tmpData = input_ptr[gidx]; \\\n\ - int4 zeros = (int4)(0); \\\n\ - int loc2 = gidx * 8; \\\n\ type0 src; \\\n\ type1 tmpDst; \\\n\ - ptr1 dst; \\\n\ vxc_ushort8 ms0; \\\n\ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ - _viv_asm(COPY, src, tmpData, len0); \\\n\ - VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ - uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - _viv_asm(COPY, dst, tmpDst, len1); \\\n\ - output_ptr[gidx] = dst; \\\n\ - vstore4(zeros, 0, tmp_update_ptr + loc2); \\\n\ - vstore4(zeros, 1, tmp_update_ptr + loc2); \\\n\ - if(gidx < res) \\\n\ + if(length > 0) \\\n\ { \\\n\ - __global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \\\n\ - __global ptr3* output_ptr1 = (__global ptr3*)img2.ptr; \\\n\ - ptr2 tmpData1 = input_ptr1[length + gidx]; \\\n\ + __global ptr0* input_ptr = (__global ptr0*)img1.ptr; \\\n\ + __global ptr1* output_ptr = (__global ptr1*)img2.ptr; \\\n\ + ptr0 tmpData = input_ptr[gidx]; \\\n\ + int4 zeros = (int4)(0); \\\n\ + int loc2 = gidx * 8; \\\n\ + ptr1 dst; \\\n\ + _viv_asm(COPY, src, tmpData, len0); \\\n\ + VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst, tmpDst, len1); \\\n\ + output_ptr[gidx] = dst; \\\n\ + vstore4(zeros, 0, tmp_update_ptr + loc2); \\\n\ + vstore4(zeros, 1, tmp_update_ptr + loc2); \\\n\ + } \\\n\ + __global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \\\n\ + __global ptr3* output_ptr1 = (__global ptr3*)img2.ptr; \\\n\ + for(int i = gidx; i < res; i += get_global_size(0)) \\\n\ + { \\\n\ + ptr2 tmpData1 = input_ptr1[length + i]; \\\n\ ptr3 dst1; \\\n\ dst1 ^= dst1; \\\n\ - tmp_update_ptr[length + gidx] = 0; \\\n\ + tmp_update_ptr[length + i] = 0; \\\n\ _viv_asm(COPY, src, tmpData1, 4); \\\n\ VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ _viv_asm(COPY, dst1, tmpDst, len3); \\\n\ - output_ptr1[length + gidx] = dst1; \\\n\ + output_ptr1[length + i] = dst1; \\\n\ } \\\n\ }\n\ SCATTER_RESET(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, 8, 8, 1, 1, uchar, uchar, 1)\n\ @@ -46493,14 +49573,17 @@ __kernel void scatter_nd_update_copy_##src0_type( \\\n\ int gidx = get_global_id(0); \\\n\ Image img1 = create_image_from_image2d(temp_ref, element_size); \\\n\ Image img2 = create_image_from_image2d(output, element_size); \\\n\ - __global ptr_type* input_ptr = (__global ptr_type*)img1.ptr; \\\n\ - __global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \\\n\ - output_ptr[gidx] = input_ptr[gidx]; \\\n\ - if(gidx < res) \\\n\ + if(length > 0) \\\n\ { \\\n\ - __global ptr_type1* input_ptr1 = (__global ptr_type1*)img1.ptr; \\\n\ - __global ptr_type1* output_ptr1 = (__global ptr_type1*)img2.ptr; \\\n\ - output_ptr1[length + 
gidx] = input_ptr1[length + gidx]; \\\n\ + __global ptr_type* input_ptr = (__global ptr_type*)img1.ptr; \\\n\ + __global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \\\n\ + output_ptr[gidx] = input_ptr[gidx]; \\\n\ + } \\\n\ + __global ptr_type1* input_ptr1 = (__global ptr_type1*)img1.ptr; \\\n\ + __global ptr_type1* output_ptr1 = (__global ptr_type1*)img2.ptr; \\\n\ + for(int i = gidx; i < res; i += get_global_size(0)) \\\n\ + { \\\n\ + output_ptr1[length + i] = input_ptr1[length + i]; \\\n\ } \\\n\ }\n\ SCATTER_ND_UPDATE_COPY(U8, vxc_uchar8, 1, uchar)\n\ @@ -51935,6 +55018,8 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ \\\n\ src_type sum = (src_type)(0); \\\n\ uint4 dst = (uint4)(0); \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst.x = convert_uint_sat(tmp_zp); \\\n\ \\\n\ float cnt = 0.0f; \\\n\ \\\n\ @@ -52099,6 +55184,8 @@ __kernel void cumsum_##name##toU8_axis1( \\\n\ \\\n\ src_type sum = (src_type)(0); \\\n\ uint4 dst = (uint4)(0); \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst.x = convert_uint_sat(tmp_zp); \\\n\ \\\n\ float cnt = 0; \\\n\ \\\n\ @@ -52263,6 +55350,8 @@ __kernel void cumsum_##name##toU8_axis0( \\\n\ \\\n\ src_type sum = (src_type)(0); \\\n\ uint4 dst = (uint4)(0); \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst.x = convert_uint_sat(tmp_zp); \\\n\ \\\n\ float cnt = 0; \\\n\ \\\n\ @@ -52334,7 +55423,8 @@ __kernel void cumsum_##name##toU8_axis0( \\\n\ } \\\n\ }\n\ CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui)\n\ -CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef)"; /* end of cumsum_cl*/ +CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef)\n\ +"; /* end of cumsum_cl*/ static const char cumsum_2d_cl[] = "\n\ __kernel void cumsum_F32toF32_axis1_2D(\n\ @@ -52422,13 +55512,16 @@ __kernel void cumsum_U8toU8_axis1_2D(\n\ \n\ uint4 sum = (uint4)(0);\n\ uint4 dst = (uint4)(0);\n\ +\n\ + int tmp_zp = convert_int_rte(output_zp);\n\ + dst.x = convert_uint_sat(tmp_zp);\n\ \n\ float cnt = 0;\n\ \n\ if(exclusive && rev)\n\ {\n\ coord.w = height - 1;\n\ - write_imageui(output, coord.zw, sum);\n\ + write_imageui(output, coord.zw, dst);\n\ for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ {\n\ uint4 data = read_imageui(input, coord.xy);\n\ @@ -52445,7 +55538,7 @@ __kernel void cumsum_U8toU8_axis1_2D(\n\ }\n\ else if(exclusive)\n\ {\n\ - write_imageui(output, coord.zw, sum);\n\ + write_imageui(output, coord.zw, dst);\n\ for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ {\n\ uint4 data = read_imageui(input, coord.xy);\n\ @@ -52511,6 +55604,8 @@ __kernel void cumsum_F32toU8_axis1_2D(\n\ \n\ float4 sum = (float4)(0);\n\ uint4 dst = (uint4)(0);\n\ + int tmp_zp = convert_int_rte(output_zp);\n\ + dst.x = convert_uint_sat(tmp_zp);\n\ \n\ float cnt = 0;\n\ \n\ @@ -52668,6 +55763,9 @@ __kernel void cumsum_U8toU8_axis0_2D(\n\ \n\ uint4 sum = (uint4)(0);\n\ uint4 dst = (uint4)(0);\n\ +\n\ + int tmp_zp = convert_int_rte(output_zp);\n\ + dst.x = convert_uint_sat(tmp_zp);\n\ \n\ float cnt = 0.0f;\n\ \n\ @@ -52675,7 +55773,7 @@ __kernel void cumsum_U8toU8_axis0_2D(\n\ {\n\ coord.x = width - 1;\n\ coord.z = coord.x;\n\ - write_imageui(output, coord.zw, sum);\n\ + write_imageui(output, coord.zw, dst);\n\ for(; coord.x > 0; coord.x--)\n\ {\n\ uint4 data = read_imageui(input, coord.xy);\n\ @@ -52693,7 +55791,7 @@ __kernel void cumsum_U8toU8_axis0_2D(\n\ else if(exclusive)\n\ {\n\ coord.z = 0;\n\ - write_imageui(output, coord.zw, sum);\n\ + write_imageui(output, coord.zw, dst);\n\ for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ {\n\ uint4 data = 
read_imageui(input, coord.xy);\n\ @@ -52759,9 +55857,10 @@ __kernel void cumsum_F32toU8_axis0_2D(\n\ \n\ float4 sum = (float4)(0);\n\ uint4 dst = (uint4)(0);\n\ + int tmp_zp = convert_int_rte(output_zp);\n\ + dst.x = convert_uint_sat(tmp_zp);\n\ \n\ float cnt = 0.0f;\n\ -\n\ if(exclusive && rev)\n\ {\n\ coord.x = width - 1;\n\ @@ -52829,7 +55928,8 @@ __kernel void cumsum_F32toU8_axis0_2D(\n\ write_imageui(output, coord.xy, dst);\n\ }\n\ }\n\ -}"; /* end of cumsum_2d_cl*/ +}\n\ +"; /* end of cumsum_2d_cl*/ static const char depth2space_crd_cl[] = "\n\ __kernel void depth2space_crd_F32toF32(\n\ @@ -58061,23 +61161,21 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layer_norm_U8toU8( }\n\ "; /* end of layer_normalization_cl*/ -static const char log_softmax_axis0_cl[] = "#define rlogE (0.693147182f)\n\ +static const char log_softmax_axis0_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define rlogE (0.693147182f)\n\ float LOG(float x)\n\ {\n\ x = log2(x);\n\ return x * rlogE;\n\ }\n\ \n\ -__kernel void log_softmax_axis0_F32toF32\n\ - (\n\ +__kernel void log_softmax_axis0_F32toF32(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ - int axis,\n\ - float beta,\n\ - float scale,\n\ - float scaleOut,\n\ - float zpOut\n\ - )\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ {\n\ int x = get_global_id(0);\n\ int y = get_global_id(1);\n\ @@ -58121,16 +61219,11 @@ __kernel void log_softmax_axis0_F32toF32\n\ }\n\ }\n\ \n\ -__kernel void log_softmax_axis0_F32toF32_2D\n\ - (\n\ +__kernel void log_softmax_axis0_F32toF32_2D(\n\ __read_only image2d_t input,\n\ __write_only image2d_t output,\n\ - int axis,\n\ - float beta,\n\ - float scale,\n\ - float scaleOut,\n\ - float zpOut\n\ - )\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ {\n\ int x = get_global_id(0);\n\ int y = get_global_id(1);\n\ @@ -58173,16 +61266,11 @@ __kernel void log_softmax_axis0_F32toF32_2D\n\ }\n\ }\n\ \n\ -__kernel void log_softmax_axis0_U8toU8\n\ - (\n\ +__kernel void log_softmax_axis0_U8toU8(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ - int axis,\n\ - float beta,\n\ - float scale,\n\ - float scaleOut,\n\ - float zpOut\n\ - )\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ {\n\ int x = get_global_id(0);\n\ int y = get_global_id(1);\n\ @@ -58228,16 +61316,11 @@ __kernel void log_softmax_axis0_U8toU8\n\ }\n\ }\n\ \n\ -__kernel void log_softmax_axis0_U8toU8_2D\n\ - (\n\ +__kernel void log_softmax_axis0_U8toU8_2D(\n\ __read_only image2d_t input,\n\ __write_only image2d_t output,\n\ - int axis,\n\ - float beta,\n\ - float scale,\n\ - float scaleOut,\n\ - float zpOut\n\ - )\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ {\n\ int x = get_global_id(0);\n\ int y = get_global_id(1);\n\ @@ -58280,10 +61363,118 @@ __kernel void log_softmax_axis0_U8toU8_2D\n\ coord_in.x++;\n\ }\n\ }\n\ +\n\ +__kernel void log_softmax_axis0_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int width = get_image_width(input);\n\ + int4 coord_in = (int4)(0, y, z, 0);\n\ + float4 maxValue, src, dst = {0.0};\n\ + uint4 data, val, out;\n\ +\n\ + data = read_imageui(input, coord_in);\n\ + data = 
data << 16;\n\ + _viv_asm(COPY, maxValue, data, 16);\n\ + for (coord_in.x = 1; coord_in.x < width; )\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + coord_in.x++;\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + maxValue = maxValue > src ? maxValue : src;\n\ + }\n\ +\n\ + float sum = 0.f;\n\ + for (coord_in.x = 0; coord_in.x < width; )\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + coord_in.x++;\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ +\n\ + float logSum = LOG(sum);\n\ + for (coord_in.x = 0; coord_in.x < width; )\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ + _viv_asm(COPY, val, dst, 16);\n\ + out = val >> 16;\n\ + write_imageui(output, coord_in, out);\n\ + coord_in.x++;\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_axis0_BF16toBF16_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int width = get_image_width(input);\n\ + int2 coord_in = (int2)(0, y);\n\ + float4 maxValue, src, dst = {0.0};\n\ + uint4 data, val, out;\n\ +\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, maxValue, data, 16);\n\ + for (coord_in.x = 1; coord_in.x < width; )\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + coord_in.x++;\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + maxValue = maxValue > src ? maxValue : src;\n\ + }\n\ +\n\ + float sum = 0.0f;\n\ + for (coord_in.x = 0; coord_in.x < width; )\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + coord_in.x++;\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ +\n\ + float logSum = LOG(sum);\n\ + for (coord_in.x = 0; coord_in.x < width; )\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ + _viv_asm(COPY, val, dst, 16);\n\ + out = val >> 16;\n\ + write_imageui(output, coord_in, out);\n\ + coord_in.x++;\n\ + }\n\ +}\n\ #undef rlogE\n\ "; /* end of log_softmax_axis0_cl*/ -static const char log_softmax_axis1_cl[] = "#define rlogE (0.693147182f)\n\ +static const char log_softmax_axis1_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define rlogE (0.693147182f)\n\ \n\ float LOG(float x)\n\ {\n\ @@ -58291,16 +61482,11 @@ float LOG(float x)\n\ return x * rlogE;\n\ }\n\ \n\ -__kernel void log_softmax_axis1_F32toF32\n\ - (\n\ +__kernel void log_softmax_axis1_F32toF32(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ - int axis,\n\ - float beta,\n\ - float scale,\n\ - float scaleOut,\n\ - float zpOut\n\ - )\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ {\n\ int x = get_global_id(0);\n\ int y = get_global_id(1);\n\ @@ -58344,16 +61530,11 @@ __kernel void log_softmax_axis1_F32toF32\n\ }\n\ }\n\ \n\ -__kernel void log_softmax_axis1_F32toF32_2D\n\ - (\n\ +__kernel void log_softmax_axis1_F32toF32_2D(\n\ __read_only image2d_t input,\n\ __write_only image2d_t output,\n\ - int axis,\n\ - float beta,\n\ - float scale,\n\ - float scaleOut,\n\ - float zpOut\n\ - )\n\ + int axis, float beta,\n\ + float 
scale, float scaleOut, float zpOut)\n\ {\n\ int x = get_global_id(0);\n\ int y = get_global_id(1);\n\ @@ -58396,16 +61577,11 @@ __kernel void log_softmax_axis1_F32toF32_2D\n\ }\n\ }\n\ \n\ -__kernel void log_softmax_axis1_U8toU8\n\ - (\n\ +__kernel void log_softmax_axis1_U8toU8(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ - int axis,\n\ - float beta,\n\ - float scale,\n\ - float scaleOut,\n\ - float zpOut\n\ - )\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ {\n\ int x = get_global_id(0);\n\ int y = get_global_id(1);\n\ @@ -58451,16 +61627,11 @@ __kernel void log_softmax_axis1_U8toU8\n\ }\n\ }\n\ \n\ -__kernel void log_softmax_axis1_U8toU8_2D\n\ - (\n\ +__kernel void log_softmax_axis1_U8toU8_2D(\n\ __read_only image2d_t input,\n\ __write_only image2d_t output,\n\ - int axis,\n\ - float beta,\n\ - float scale,\n\ - float scaleOut,\n\ - float zpOut\n\ - )\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ {\n\ int x = get_global_id(0);\n\ int y = get_global_id(1);\n\ @@ -58503,10 +61674,120 @@ __kernel void log_softmax_axis1_U8toU8_2D\n\ coord_in.y++;\n\ }\n\ }\n\ +\n\ +__kernel void log_softmax_axis1_BF16oBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int height = get_image_height(input);\n\ + int4 coord_in = (int4)(x, 0, z, 0);\n\ + float4 maxValue, src, dst = {0.0};\n\ + uint4 data, val, out;\n\ +\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, maxValue, data, 16);\n\ + for (coord_in.y = 1; coord_in.y < height; )\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + coord_in.y++;\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + maxValue = maxValue > src ? maxValue : src;\n\ + }\n\ +\n\ + float sum = 0.f;\n\ + for (coord_in.y = 0; coord_in.y < height; )\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + coord_in.y++;\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ +\n\ + float logSum = LOG(sum);\n\ + for (coord_in.y = 0; coord_in.y < height; )\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ +\n\ + _viv_asm(COPY, val, dst, 16);\n\ + out = val >> 16;\n\ +\n\ + write_imageui(output, coord_in, out);\n\ + coord_in.y++;\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_axis1_BF16toBF16_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int height = get_image_height(input);\n\ + int2 coord_in = (int2)(x, 0);\n\ + float4 maxValue, src, dst = {0.0};\n\ + uint4 data, val, out;\n\ +\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, maxValue, data, 16);\n\ + for (coord_in.y = 1; coord_in.y < height; )\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + coord_in.y++;\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + maxValue = maxValue > src ? 
maxValue : src;\n\ + }\n\ +\n\ + float sum = 0.0f;\n\ + for (coord_in.y = 0; coord_in.y < height; )\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + coord_in.y++;\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ +\n\ + float logSum = 1.0f * LOG(sum);\n\ + for (coord_in.y = 0; coord_in.y < height; )\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ + _viv_asm(COPY, val, dst, 16);\n\ + out = val >> 16;\n\ + write_imageui(output, coord_in, out);\n\ + coord_in.y++;\n\ + }\n\ +}\n\ #undef rlogE\n\ "; /* end of log_softmax_axis1_cl*/ -static const char log_softmax_axis2_cl[] = "#define rlogE (0.693147182f)\n\ +static const char log_softmax_axis2_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define rlogE (0.693147182f)\n\ float LOG(float x)\n\ {\n\ x = log2(x);\n\ @@ -58620,6 +61901,70 @@ __kernel void log_softmax_axis2_U8toU8\n\ coord_in.z++;\n\ }\n\ }\n\ +\n\ +__kernel void log_softmax_axis2_BF16toBF16\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float beta,\n\ + float scale,\n\ + float scaleOut,\n\ + float zpOut\n\ + )\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int z = get_global_id(2);\n\ + int depth = get_image_array_size(input);\n\ + int4 coord_in = (int4)(x, y, 0, 0);\n\ + float4 maxValue;\n\ + float4 src, dst = {0.0};\n\ + uint4 data, val, out;\n\ +\n\ + // Find max element value which we'll use to ensure numerical stability\n\ + // taking advantage of the following equality:\n\ + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, maxValue, data, 16);\n\ + for (coord_in.z = 1; coord_in.z < depth; )\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + coord_in.z++;\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + maxValue = maxValue > src ? 
maxValue : src;\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.z = 0; coord_in.z < depth; )\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + coord_in.z++;\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.z = 0; coord_in.z < depth; )\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ + _viv_asm(COPY, val, dst, 16);\n\ + out = val >> 16;\n\ + write_imageui(output, coord_in, out);\n\ + coord_in.z++;\n\ + }\n\ +}\n\ #undef rlogE\n\ "; /* end of log_softmax_axis2_cl*/ @@ -62123,6 +65468,135 @@ GEMM_TRANSB_3D(U8U8toF32,float4,read_imageui,convert_float4,write_imagef);\n\ \n\ "; /* end of matrixmul_cl*/ +static const char matrixmul_4x_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ +\n\ +__kernel void gemm_4x_F32F32toF32_2D(\n\ + __read_only image2d_t inputA,\n\ + __read_only image2d_t inputB,\n\ + __write_only image2d_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ + )\n\ +{\n\ + int offset0 = get_global_id(0) * K;\n\ + int offset1 = offset0 + K;\n\ + int offset2 = offset1 + K;\n\ + int offset3 = offset2 + K;\n\ + int out_offset = get_global_id(0);\n\ + int z = 0;\n\ + float4 sum = (float4)(0, 0, 0, 0);\n\ +\n\ + Image in0_tensor = create_image_from_image2d(inputA, 4);\n\ + __global float* in0_ptr = (__global float*)in0_tensor.ptr;\n\ + __global float* in0_ptr0 = in0_ptr + offset0;\n\ + __global float* in0_ptr1 = in0_ptr + offset1;\n\ + __global float* in0_ptr2 = in0_ptr + offset2;\n\ + __global float* in0_ptr3 = in0_ptr + offset3;\n\ +\n\ + Image in1_tensor = create_image_from_image2d(inputB, 4);\n\ + __global float* in1_ptr = (__global float*)in1_tensor.ptr;\n\ +\n\ + Image o_tensor = create_image_from_image2d(output, 4);\n\ + __global float* output_ptr = (__global float*)o_tensor.ptr + out_offset;\n\ +\n\ + int step = K >> 2;\n\ + for(z = 0; z < step; z++)\n\ + {\n\ + float4 tempA0, tempA1, tempA2, tempA3;\n\ + float4 tempB0;\n\ +\n\ + tempB0 = vload4(z, in1_ptr);\n\ + tempA0 = vload4(z, in0_ptr0);\n\ + tempA1 = vload4(z, in0_ptr1);\n\ + tempA2 = vload4(z, in0_ptr2);\n\ + tempA3 = vload4(z, in0_ptr3);\n\ +\n\ + sum.x += dot(tempA0, tempB0);\n\ + sum.y += dot(tempA1, tempB0);\n\ + sum.z += dot(tempA2, tempB0);\n\ + sum.w += dot(tempA3, tempB0);\n\ + }\n\ +\n\ + vstore4(sum, 0, output_ptr);\n\ +\n\ +}\n\ +\n\ +__kernel void gemm_4x_transa_F32F32toF32_2D(\n\ + __read_only image2d_t inputA,\n\ + __read_only image2d_t inputB,\n\ + __write_only image2d_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ + )\n\ +{\n\ + int offset0 = get_global_id(0);\n\ + int offset1 = M << 2;\n\ +\n\ + int z = 0;\n\ + float4 sum = (float4)(0, 0, 0, 0);\n\ +\n\ + Image in0_tensor = create_image_from_image2d(inputA, 4);\n\ + __global float* in0_ptr0 = (__global float*)in0_tensor.ptr + offset0;\n\ + __global float* in0_ptr1 = in0_ptr0 + M;\n\ + __global float* in0_ptr2 = in0_ptr1 + M;\n\ + __global float* in0_ptr3 = in0_ptr2 + M;\n\ +\n\ + Image in1_tensor = create_image_from_image2d(inputB, 4);\n\ + __global float* in1_ptr 
= (__global float*)in1_tensor.ptr;\n\ +\n\ + Image o_tensor = create_image_from_image2d(output, 4);\n\ + __global float* output_ptr = (__global float*)o_tensor.ptr + offset0;\n\ +\n\ + int step = K >> 2;\n\ + for(z = 0; z < step; z++)\n\ + {\n\ + float4 tempA0, tempA1, tempA2, tempA3;\n\ + float4 tempB0;\n\ +\n\ + tempB0 = vload4(z, in1_ptr);\n\ + tempA0 = vload4(0, in0_ptr0);\n\ + tempA1 = vload4(0, in0_ptr1);\n\ + tempA2 = vload4(0, in0_ptr2);\n\ + tempA3 = vload4(0, in0_ptr3);\n\ +\n\ + sum += tempA0 * tempB0.x;\n\ + sum += tempA1 * tempB0.y;\n\ + sum += tempA2 * tempB0.z;\n\ + sum += tempA3 * tempB0.w;\n\ +\n\ + in0_ptr0 = in0_ptr0 + offset1;\n\ + in0_ptr1 = in0_ptr1 + offset1;\n\ + in0_ptr2 = in0_ptr2 + offset1;\n\ + in0_ptr3 = in0_ptr3 + offset1;\n\ +\n\ + }\n\ +\n\ + vstore4(sum, 0, output_ptr);\n\ +\n\ +}\n\ +\n\ +\n\ +\n\ +"; /* end of matrixmul_4x_cl*/ + static const char matrixmul_cross_cl[] = "__kernel void gemm_F32F32toF32_merge(\n\ __read_only image2d_array_t inputA,\n\ __read_only image2d_array_t inputB,\n\ @@ -62552,6 +66026,225 @@ __kernel void maximum_I32I32toI32_2D\n\ }\n\ "; /* end of maximum_cl*/ +static const char maxpool_cl[] = "#define VSI_FLOAT32_MIN (1.175494351e-38F)\n\ +\n\ +#define MAXPOOL_QINT(in_name, out_name, src_type, dst_type, max_val, read_func, write_func, conv_func) \\\n\ +__kernel void maxpool_##in_name##to##out_name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int width, \\\n\ + int height, \\\n\ + int stride_x, \\\n\ + int stride_y, \\\n\ + int pad_x, \\\n\ + int pad_y, \\\n\ + int kernel_dia_x, \\\n\ + int kernel_dia_y, \\\n\ + int dilation_x, \\\n\ + int dilation_y, \\\n\ + float inout_scale, \\\n\ + float inout_tail) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0); \\\n\ + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y); \\\n\ + int4 coord_in = coord_out; \\\n\ + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y); \\\n\ + \\\n\ + for(; pos_start.x < 0;) \\\n\ + { \\\n\ + pos_start.x += dilation_x; \\\n\ + } \\\n\ + for(; pos_start.y < 0;) \\\n\ + { \\\n\ + pos_start.y += dilation_y; \\\n\ + } \\\n\ + \\\n\ + pos_end = min(pos_end, (int2)(width, height)); \\\n\ + \\\n\ + src_type src0, maxVal; \\\n\ + maxVal.x = max_val; \\\n\ + \\\n\ + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y) \\\n\ + { \\\n\ + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;) \\\n\ + { \\\n\ + src0 = read_func(input, coord_in); \\\n\ + coord_in.x += dilation_x; \\\n\ + maxVal = max(src0, maxVal); \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + float4 fValTmp; \\\n\ + fValTmp.x = maxVal.x * inout_scale + inout_tail; \\\n\ + dst_type dst = conv_func(fValTmp); \\\n\ + write_func(output, coord_out, dst.xxxx); \\\n\ +}\n\ +MAXPOOL_QINT(U32, U32, uint4, uint4, 0, read_imageui, write_imageui, convert_uint4_rte)\n\ +MAXPOOL_QINT(I32, I32, int4, int4, -2147483648, read_imagei, write_imagei, convert_int4_rte)\n\ +\n\ +__kernel void maxpool_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int width,\n\ + int height,\n\ + int stride_x,\n\ + int stride_y,\n\ + int pad_x,\n\ + int pad_y,\n\ + int kernel_dia_x,\n\ + int kernel_dia_y,\n\ + int dilation_x,\n\ + int dilation_y,\n\ + float inout_scale,\n\ + float inout_tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + 
int gidz = get_global_id(2);\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0);\n\ + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y);\n\ + int4 coord_in = coord_out;\n\ + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y);\n\ +\n\ + for(; pos_start.x < 0;)\n\ + {\n\ + pos_start.x += dilation_x;\n\ + }\n\ + for(; pos_start.y < 0;)\n\ + {\n\ + pos_start.y += dilation_y;\n\ + }\n\ +\n\ + pos_end = min(pos_end, (int2)(width, height));\n\ +\n\ + float4 src0, maxVal;\n\ + maxVal.x = VSI_FLOAT32_MIN;\n\ +\n\ + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y)\n\ + {\n\ + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;)\n\ + {\n\ + src0 = read_imagef(input, coord_in);\n\ + coord_in.x += dilation_x;\n\ + maxVal = max(src0, maxVal);\n\ + }\n\ + }\n\ +\n\ + write_imagef(output, coord_out, maxVal.xxxx);\n\ +}\n\ +\n\ +__kernel void maxpool_U32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int width,\n\ + int height,\n\ + int stride_x,\n\ + int stride_y,\n\ + int pad_x,\n\ + int pad_y,\n\ + int kernel_dia_x,\n\ + int kernel_dia_y,\n\ + int dilation_x,\n\ + int dilation_y,\n\ + float inout_scale,\n\ + float inout_tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0);\n\ + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y);\n\ + int4 coord_in = coord_out;\n\ + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y);\n\ +\n\ + for(; pos_start.x < 0;)\n\ + {\n\ + pos_start.x += dilation_x;\n\ + }\n\ + for(; pos_start.y < 0;)\n\ + {\n\ + pos_start.y += dilation_y;\n\ + }\n\ +\n\ + pos_end = min(pos_end, (int2)(width, height));\n\ +\n\ + uint4 src0, maxVal;\n\ + maxVal.x = 0;\n\ +\n\ + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y)\n\ + {\n\ + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;)\n\ + {\n\ + src0 = read_imageui(input, coord_in);\n\ + coord_in.x += dilation_x;\n\ + maxVal = max(src0, maxVal);\n\ + }\n\ + }\n\ +\n\ + float4 dst;\n\ + dst.x = maxVal.x * inout_scale + inout_tail;\n\ +\n\ + write_imagef(output, coord_out, dst.xxxx);\n\ +}\n\ +\n\ +__kernel void maxpool_F32toU32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int width,\n\ + int height,\n\ + int stride_x,\n\ + int stride_y,\n\ + int pad_x,\n\ + int pad_y,\n\ + int kernel_dia_x,\n\ + int kernel_dia_y,\n\ + int dilation_x,\n\ + int dilation_y,\n\ + float inout_scale,\n\ + float inout_tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0);\n\ + int2 pos_start = coord_out.xy * (int2)(stride_x, stride_y) - (int2)(pad_x, pad_y);\n\ + int4 coord_in = coord_out;\n\ + int2 pos_end = pos_start + (int2)(kernel_dia_x, kernel_dia_y);\n\ +\n\ + for(; pos_start.x < 0;)\n\ + {\n\ + pos_start.x += dilation_x;\n\ + }\n\ + for(; pos_start.y < 0;)\n\ + {\n\ + pos_start.y += dilation_y;\n\ + }\n\ +\n\ + pos_end = min(pos_end, (int2)(width, height));\n\ +\n\ + float4 src0, maxVal;\n\ + maxVal.x = VSI_FLOAT32_MIN;\n\ +\n\ + for(coord_in.y = pos_start.y; coord_in.y < pos_end.y; coord_in.y += dilation_y)\n\ + {\n\ + for(coord_in.x = pos_start.x; coord_in.x < pos_end.x;)\n\ + {\n\ + src0 = read_imagef(input, coord_in);\n\ + coord_in.x += dilation_x;\n\ + maxVal = max(src0, maxVal);\n\ + }\n\ + }\n\ +\n\ + 
uint4 dst;\n\ + dst.x = convert_uint_rte(maxVal.x * inout_scale + inout_tail);\n\ +\n\ + write_imageui(output, coord_out, dst.xxxx);\n\ +}\n\ +"; /* end of maxpool_cl*/ + static const char maxpoolwithargmax_cl[] = "#define FP32_MIN -3.4e38\n\ #define I32_MIN -2147483647\n\ \n\ @@ -63931,6 +67624,69 @@ __kernel void moments_axis01_BF16toF32(\n\ write_imagef(output_vari, coord_out, vari);\n\ }\n\ }\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(8, 8, 1))) void moments_axis12_U8toF32(\n\ + image2d_array_t input, image2d_array_t output_mean, image2d_array_t output_vari,\n\ + int axis, int axis_num, int input_zp, float input_scale,\n\ + int width, int height, int chn, float dimRatio\n\ + )\n\ +{\n\ + int lidx = get_local_id(0);\n\ + int lidy = get_local_id(1);\n\ + int gidz = get_global_id(2); // width\n\ +\n\ + int4 coord = (int4)(gidz, lidx, lidy, 0);\n\ + uint4 data;\n\ + float sum = 0, sqr = 0;\n\ + float e2InScale = input_scale * input_scale;\n\ +\n\ + __local uint lcl_sumSqr[128];\n\ + __local uint lcl_sumSqr1[32];\n\ +\n\ + uint2 tmpSumSqr = 0;\n\ + for(coord.z = lidy; coord.z < chn; coord.z += 8)\n\ + {\n\ + for(coord.y = lidx; coord.y < height;)\n\ + {\n\ + data = read_imageui(input, coord);\n\ + coord.y += 8;\n\ + tmpSumSqr = tmpSumSqr + (uint2)(data.x, data.x * data.x);\n\ + }\n\ + //sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ + //sum += (tmpSum - height * input_zp) * input_scale;\n\ + }\n\ + int index = lidx + lidy * 8;\n\ + vstore2(tmpSumSqr, index, lcl_sumSqr);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + if(index < 16)\n\ + {\n\ + uint4 val0 = vload4(index, lcl_sumSqr);\n\ + uint4 val1 = vload4(index, lcl_sumSqr + 64);\n\ + val0 += val1;\n\ + uint2 val2 = val0.xy + val0.zw;\n\ + vstore2(val2, index, lcl_sumSqr1);\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + if(index == 0)\n\ + {\n\ + uint4 val0 = 0;\n\ + for(int i = 0; i < 8; i++)\n\ + {\n\ + val0 += vload4(i, lcl_sumSqr1);\n\ + }\n\ +\n\ + float2 tmpVal = convert_float2(val0.xy + val0.zw);\n\ + sum = (tmpVal.x - height * chn * input_zp) * input_scale;\n\ + sqr = (tmpVal.y - 2 * input_zp * tmpVal.x + height * chn * input_zp * input_zp) * e2InScale;\n\ + float4 mean, vari;\n\ + mean.x = sum * dimRatio;\n\ + vari.x = sqr * dimRatio;\n\ + vari.x = vari.x - mean.x * mean.x;\n\ +\n\ + write_imagef(output_mean, coord.xwww, mean);\n\ + write_imagef(output_vari, coord.xwww, vari);\n\ + }\n\ +}\n\ "; /* end of moments_axis01_cl*/ static const char moments_axis012_cl[] = "__kernel void moments_axis012_U8toF32(\n\ @@ -71458,7 +75214,10 @@ static const source_map_t evis_resource[] = {"cumsum_f16_u8_vx", cumsum_f16_u8_vx}, {"custom_softmax_vx", custom_softmax_vx}, {"custom_warp_affine_vx", custom_warp_affine_vx}, + {"custom_warp_affine_2d_vx", custom_warp_affine_2d_vx}, + {"custom_warp_affine_optional_vx", custom_warp_affine_optional_vx}, {"custom_warp_affine_rgb_vx", custom_warp_affine_rgb_vx}, + {"custom_warp_affine_rgb_optional_vx", custom_warp_affine_rgb_optional_vx}, {"custom_warp_perspective_vx", custom_warp_perspective_vx}, {"depth2space_crd_vx", depth2space_crd_vx}, {"depthwise_conv1d_src0_vx", depthwise_conv1d_src0_vx}, @@ -71495,6 +75254,7 @@ static const source_map_t evis_resource[] = {"grucell_activation_sma_vx", grucell_activation_sma_vx}, {"grucell_activation_z_h_vx", grucell_activation_z_h_vx}, {"grucell_cdnn_activation_vx", grucell_cdnn_activation_vx}, + {"grucell_cdnn_activation_bf16_vx", grucell_cdnn_activation_bf16_vx}, {"grucell_cdnn_activation_u8_vx", 
grucell_cdnn_activation_u8_vx}, {"grucell_h_times_activation_r_vx", grucell_h_times_activation_r_vx}, {"grucell_reset_after_activation_vx", grucell_reset_after_activation_vx}, @@ -71520,24 +75280,36 @@ static const source_map_t evis_resource[] = {"log_softmax_axis2_vx", log_softmax_axis2_vx}, {"logical_not_vx", logical_not_vx}, {"logical_ops_vx", logical_ops_vx}, + {"lstmunit_activation_BP_BF16_vx", lstmunit_activation_BP_BF16_vx}, {"lstmunit_activation_BP_F16_vx", lstmunit_activation_BP_F16_vx}, {"lstmunit_activation_BP_U8_vx", lstmunit_activation_BP_U8_vx}, + {"lstmunit_activation_B_BF16_vx", lstmunit_activation_B_BF16_vx}, {"lstmunit_activation_B_F16_vx", lstmunit_activation_B_F16_vx}, {"lstmunit_activation_B_U8_vx", lstmunit_activation_B_U8_vx}, + {"lstmunit_activation_CBP_BF16_vx", lstmunit_activation_CBP_BF16_vx}, {"lstmunit_activation_CBP_F16_vx", lstmunit_activation_CBP_F16_vx}, {"lstmunit_activation_CBP_U8_vx", lstmunit_activation_CBP_U8_vx}, + {"lstmunit_activation_CB_BF16_vx", lstmunit_activation_CB_BF16_vx}, {"lstmunit_activation_CB_F16_vx", lstmunit_activation_CB_F16_vx}, {"lstmunit_activation_CB_U8_vx", lstmunit_activation_CB_U8_vx}, + {"lstmunit_activation_CLP_BF16_vx", lstmunit_activation_CLP_BF16_vx}, {"lstmunit_activation_CLP_F16_vx", lstmunit_activation_CLP_F16_vx}, + {"lstmunit_activation_CL_BF16_vx", lstmunit_activation_CL_BF16_vx}, {"lstmunit_activation_CL_F16_vx", lstmunit_activation_CL_F16_vx}, + {"lstmunit_activation_CSP_BF16_vx", lstmunit_activation_CSP_BF16_vx}, {"lstmunit_activation_CSP_F16_vx", lstmunit_activation_CSP_F16_vx}, {"lstmunit_activation_CSP_U8_vx", lstmunit_activation_CSP_U8_vx}, + {"lstmunit_activation_CS_BF16_vx", lstmunit_activation_CS_BF16_vx}, {"lstmunit_activation_CS_F16_vx", lstmunit_activation_CS_F16_vx}, {"lstmunit_activation_CS_U8_vx", lstmunit_activation_CS_U8_vx}, + {"lstmunit_activation_LP_BF16_vx", lstmunit_activation_LP_BF16_vx}, {"lstmunit_activation_LP_F16_vx", lstmunit_activation_LP_F16_vx}, + {"lstmunit_activation_L_BF16_vx", lstmunit_activation_L_BF16_vx}, {"lstmunit_activation_L_F16_vx", lstmunit_activation_L_F16_vx}, + {"lstmunit_activation_SP_BF16_vx", lstmunit_activation_SP_BF16_vx}, {"lstmunit_activation_SP_F16_vx", lstmunit_activation_SP_F16_vx}, {"lstmunit_activation_SP_U8_vx", lstmunit_activation_SP_U8_vx}, + {"lstmunit_activation_S_BF16_vx", lstmunit_activation_S_BF16_vx}, {"lstmunit_activation_S_F16_vx", lstmunit_activation_S_F16_vx}, {"lstmunit_activation_S_U8_vx", lstmunit_activation_S_U8_vx}, {"matrixmul_bf16_vx", matrixmul_bf16_vx}, @@ -71561,6 +75333,7 @@ static const source_map_t evis_resource[] = {"matrixmul_u8u8_f16_vx", matrixmul_u8u8_f16_vx}, {"maximum_0_vx", maximum_0_vx}, {"maximum_1_vx", maximum_1_vx}, + {"maxpool_vx", maxpool_vx}, {"minimum_0_vx", minimum_0_vx}, {"minimum_1_vx", minimum_1_vx}, {"mod_vx", mod_vx}, @@ -71764,9 +75537,11 @@ static const source_map_t cl_resource[] = {"lstmunit_activation_S_F32_cl", lstmunit_activation_S_F32_cl}, {"lstmunit_activation_S_U8_cl", lstmunit_activation_S_U8_cl}, {"matrixmul_cl", matrixmul_cl}, + {"matrixmul_4x_cl", matrixmul_4x_cl}, {"matrixmul_cross_cl", matrixmul_cross_cl}, {"matrixmul_transA_cl", matrixmul_transA_cl}, {"maximum_cl", maximum_cl}, + {"maxpool_cl", maxpool_cl}, {"maxpoolwithargmax_cl", maxpoolwithargmax_cl}, {"maxpoolwithargmax_2d_cl", maxpoolwithargmax_2d_cl}, {"maxunpool_cl", maxunpool_cl}, diff --git a/src/tim/vx/internal/src/makefile.linux b/src/tim/vx/internal/src/makefile.linux deleted file mode 100644 index eace064..0000000 --- 
a/src/tim/vx/internal/src/makefile.linux +++ /dev/null @@ -1,261 +0,0 @@ -# to make ovxlib can compile both IDE and SKD -# if you want to use IDE to compile : export USE_IDE_LIB=1 -# and VIVANTE_SDK_DIR=..../VeriSilicon/VivanteIDE5.4.0/cmdtools/vsimulator - -################################################################################### -#common parts -# OBJECTS. - -OBJECTS = $(OBJ_DIR)/vsi_nn_context.o \ - $(OBJ_DIR)/vsi_nn_client_op.o \ - $(OBJ_DIR)/vsi_nn_graph.o \ - $(OBJ_DIR)/vsi_nn_node_attr_template.o \ - $(OBJ_DIR)/vsi_nn_node.o \ - $(OBJ_DIR)/vsi_nn_ops.o \ - $(OBJ_DIR)/vsi_nn_daemon.o \ - $(OBJ_DIR)/vsi_nn_tensor.o \ - $(OBJ_DIR)/vsi_nn_version.o \ - $(OBJ_DIR)/vsi_nn_rnn.o \ - $(OBJ_DIR)/vsi_nn_rnn_helper.o \ - $(OBJ_DIR)/vsi_nn_internal_node.o \ - $(OBJ_DIR)/vsi_nn_log.o \ - $(OBJ_DIR)/vsi_nn_graph_optimization.o \ - $(OBJ_DIR)/vsi_nn_pre_post_process.o - -vpath %.c utils -OBJECTS += $(OBJ_DIR)/vsi_nn_code_generator.o \ - $(OBJ_DIR)/vsi_nn_binary_tree.o \ - $(OBJ_DIR)/vsi_nn_map.o \ - $(OBJ_DIR)/vsi_nn_link_list.o \ - $(OBJ_DIR)/vsi_nn_math.o \ - $(OBJ_DIR)/vsi_nn_dtype_util.o \ - $(OBJ_DIR)/vsi_nn_shape_util.o \ - $(OBJ_DIR)/vsi_nn_dtype.o \ - $(OBJ_DIR)/vsi_nn_limits.o \ - $(OBJ_DIR)/vsi_nn_util.o \ - $(OBJ_DIR)/vsi_nn_dlfcn.o \ - $(OBJ_DIR)/vsi_nn_constraint_check.o \ - $(OBJ_DIR)/vsi_nn_hashmap.o \ - $(OBJ_DIR)/vsi_nn_tensor_op.o - -vpath %.c quantization -OBJECTS += $(OBJ_DIR)/vsi_nn_dynamic_fixed_point.o \ - $(OBJ_DIR)/vsi_nn_asymmetric_affine.o \ - $(OBJ_DIR)/vsi_nn_perchannel_symmetric_affine.o - -vpath %.c post -OBJECTS += $(OBJ_DIR)/vsi_nn_post_fasterrcnn.o \ - $(OBJ_DIR)/vsi_nn_post_cmupose.o - -vpath %.c libnnext -OBJECTS += $(OBJ_DIR)/vsi_nn_libnnext_resource.o \ - $(OBJ_DIR)/vsi_nn_vxkernel.o - -vpath %.c cpu_backend -SRCS += ${notdir ${wildcard cpu_backend/*.c}} - -vpath %.c libnnext/ops/kernel -SRCS += ${notdir ${wildcard libnnext/ops/kernel/*.c}} - -vpath %.c ops -SRCS += ${notdir ${wildcard ops/*.c}} - -vpath %.c kernel -SRCS += ${notdir ${wildcard kernel/*.c}} - -vpath %.c kernel/cl -SRCS += ${notdir ${wildcard kernel/cl/*.c}} - -vpath %.c kernel/cpu -SRCS += ${notdir ${wildcard kernel/cpu/*.c}} - -vpath %.c kernel/evis -SRCS += ${notdir ${wildcard kernel/evis/*.c}} - -vpath %.c kernel/vx -SRCS += ${notdir ${wildcard kernel/vx/*.c}} - -vpath %.c kernel/sp -SRCS += ${notdir ${wildcard kernel/sp/*.c}} - -vpath %.c custom/ops -SRCS += ${notdir ${wildcard custom/ops/*.c}} - -vpath %.c custom/ops/kernel/evis -SRCS += ${notdir ${wildcard custom/ops/kernel/evis/*.c}} - -vpath %.c custom/ops/kernel/cl -SRCS += ${notdir ${wildcard custom/ops/kernel/cl/*.c}} - -vpath %.c custom/ops/kernel/cpu -SRCS += ${notdir ${wildcard custom/ops/kernel/cpu/*.c}} - -vpath %.c custom/ops/kernel/sp -SRCS += ${notdir ${wildcard custom/ops/kernel/sp/*.c}} - -OBJECTS += ${patsubst %.c, $(OBJ_DIR)/%.o, $(SRCS)} - -ifeq ($(USE_VIP_DEVICE),1) -vpath %.cpp vip -OBJECTS += $(OBJ_DIR)/virtual_device.o -endif - -################################################################################ -ifeq ($(USE_IDE_LIB),1) -# IDE. - -CC=$(CROSS_COMPILE)gcc - -INCLUDES=-I. 
-I$(VIVANTE_SDK_DIR)/include/ \ - -I$(VIVANTE_SDK_DIR)/include/CL \ - -I$(VIVANTE_SDK_DIR)/include/VX \ - -I../include/ops -I../include/utils -I../include/inference \ - -I../include/client -I../include -I../include/libnnext \ - -I../include/cpu_backend \ - -I../src - -ifeq (1,$(DEBUG)) -CFLAGS+=-g -LFLAGS+=-g -else -CFLAGS+=-O3 -LFLAGS+=-O3 -endif -CFLAGS += $(INCLUDES) -CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Werror -Wno-strict-aliasing -Wno-maybe-uninitialized -CFLAGS += -fvisibility=hidden -D'OVXLIB_API=__attribute__((visibility("default")))' - -LIBS+= -L$(VIVANTE_SDK_DIR)/lib \ - -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy -lArchModelSw -lNNArchPerf -LIBS+= -L$(VIVANTE_SDK_DIR)/lib/vsim \ - -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy -LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux \ - -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy -LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux/vsim \ - -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy -LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux/vsim \ - -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy -LIBS+= -L$(VIVANTE_SDK_DIR)/../common/lib/ \ - -lvdtproxy -LIBS += -lm -ldl - -File = $(VIVANTE_SDK_DIR)/lib/libjpeg.a -File2 = $(VIVANTE_SDK_DIR)/lib/x64_linux/libjpeg.a -File3 = $(VIVANTE_SDK_DIR)/../common/lib/libjpeg.a -ifeq ($(File),$(wildcard $(File))) -LIBS+= $(File) -else ifeq ($(File2),$(wildcard $(File2))) -LIBS+= $(File2) -else -LIBS+= $(File3) -endif - -################################################################################### -# Macros. -CFLAGS += -fPIC -DYNAMIC := 1 -TARGET_NAME = libovxlib.so -OBJ_DIR=bin_r -TARGET_OUTPUT = $(OBJ_DIR)/$(TARGET_NAME) - -all: $(TARGET_OUTPUT) -clean: - @rm -rf $(OBJ_DIR)/* $(OBJ_DIR) - -install: $(TARGET_OUTPUT) - -################################################################################ - -LDFLAGS += -Wall -shared -Wl,-soname,$(TARGET_NAME) -Wl,-z,defs -fPIC - -ifeq ($(USE_VIP_DEVICE),1) -LDFLAGS += -pthread -LIBS += -lstdc++ -INCLUDE += -I../include/vip -$(OBJ_DIR)/virtual_device.o: virtual_device.cpp - @echo " COMPILE $(abspath $<)" - @mkdir -p $(OBJ_DIR) - @$(CXX) -c -std=c++14 -pthread $(CFLAGS) -o $@ $< -endif - -$(TARGET_OUTPUT): $(OBJECTS) - @echo " LINK \033[1m$(notdir $@)\033[0m" - @$(CC) $(LDFLAGS) $(OBJECTS) -o $(TARGET_OUTPUT) $(LIBS) - -$(OBJ_DIR)/%.o: %.c - @echo " COMPILE $(abspath $<)" - @mkdir -p $(OBJ_DIR) - @$(CC) -c $(CFLAGS) -o $@ $< - -else -################################################################################## -#SDK. - -# include common definition. -include $(AQROOT)/makefile.linux.def - -################################################################################# -INCLUDE += -I$(VIVANTE_SDK_INC) -I$(VIVANTE_SDK_INC)/HAL -I$(AQROOT)/sdk/inc -INCLUDE += -I../include/ops -I../include/utils -I../include/inference -INCLUDE += -I../include/client -I../include -I../include/libnnext -INCLUDE += -I../include/cpu_backend -INCLUDE += -I../src - -CFLAGS += $(INCLUDE) -CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Werror -CFLAGS += -fvisibility=hidden -D'OVXLIB_API=__attribute__((visibility("default")))' - -################################################################################ -# Supply necessary libraries. 
-ifeq ($(USE_VXC_BINARY)$(USE_VSC_LITE),11) -LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC_Lite -lGAL -else -LIBS += -L$(VIVANTE_SDK_LIB) -l OpenVX -l OpenVXU -l CLC -l VSC -lGAL -endif -LIBS += -lm -ldl - -############################################################################# -# Macros. -ifeq ($(gcdSTATIC_LINK), 1) -STATIC=1 -TARGET_NAME = libovxlib.a -else -CFLAGS += -fPIC -DYNAMIC := 1 -TARGET_NAME = libovxlib.so -endif - -ifneq ("$(OVXLIB_CONFIG)", "") - CFLAGS += -D$(OVXLIB_CONFIG) -endif - -ifneq ($(gcdSTATIC_LINK), 1) - ifeq ($(VSI_GPERF_DEBUG), 1) - TCMALLOC_DIR = $(OVXLIB_DIR)/third-party/gperftools - CFLAGS += -I$(TCMALLOC_DIR)/src - CFLAGS += -I$(TCMALLOC_DIR)/src/gperftools - CFLAGS += -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free - CFLAGS += -g - LIBS += -L$(TCMALLOC_DIR)/.libs -ltcmalloc - endif -endif -############################################################################# - -# installation directory -INSTALL_DIR := $(VIVANTE_SDK_LIB) - -################################################################################ -# Include the common makefile. - -ifeq ($(USE_VIP_DEVICE),1) -LDFLAGS += -pthread -LIBS += -lstdc++ -INCLUDE += -I../include/vip -$(OBJ_DIR)/virtual_device.o: virtual_device.cpp - @echo " COMPILE $(abspath $<)" - @mkdir -p $(OBJ_DIR) - @$(CXX) -c -std=c++14 -pthread $(CFLAGS) -o $@ $< -endif - -include $(AQROOT)/common.target -endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c index d75a10a..dec079c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c @@ -234,6 +234,7 @@ static vsi_bool op_check IO_TYPE(D_F32, D_I16) IO_TYPE(D_F16, D_I32) IO_TYPE(D_I32, D_I32) + IO_TYPE(D_I32, D_I16) IO_TYPE(D_I8|Q_DFP, D_I32) IO_TYPE(D_U8|Q_ASYM, D_I32) IO_TYPE(D_I8|Q_ASYM, D_U8) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c index e3de22f..3f8ff55 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c @@ -299,7 +299,7 @@ static vsi_bool op_setup } ret = vsi_nn_op_common_setup(self, inputs, outputs); - if ( _is_dataconvert_op(self, inputs, outputs) ) + if ( _is_dataconvert_op(self, inputs, outputs) && ret ) { vsi_nn_internal_node_t* curr = NULL; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c index 47b5889..34afc98 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c @@ -128,6 +128,9 @@ static vsi_bool _is_tensorview_support #ifdef VSI_CONCAT_ENHANCE_SUPPORT // Driver support concat optimize in all dimensions. ret = TRUE; + + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(outputs); #else /* If the concat op need to be optimized to tensor view, the memory must be continues. 
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c index c82f15f..7584712 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c @@ -24,6 +24,7 @@ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_graph.h" #include "vsi_nn_node.h" @@ -216,8 +217,11 @@ static vsi_bool op_setup if( VSI_NN_DIM_FMT_NHWC == inputs[1]->attr.dtype.fmt && VSI_NN_TYPE_VDATA != inputs[1]->attr.dtype.vx_type ) { - vsi_nn_TransposeTensor( self->graph, inputs[1], perm, 4, NULL ); - inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW; + if (!((vsi_nn_tensor_prv_t*)inputs[1])->processed) + { + vsi_nn_TransposeTensor(self->graph, inputs[1], perm, 4, NULL); + inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW; + } } #ifdef VX_CONVERT_POLICY_WRAP_ENABLE @@ -227,6 +231,8 @@ static vsi_bool op_setup } #endif + ((vsi_nn_tensor_prv_t*)inputs[1])->processed = TRUE; + nn_param = &self->nn_param.conv2d; vsi_nn_compute_padding( diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index 6d109f0..bfeeab2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -248,6 +248,7 @@ static vsi_bool op_check IO_TYPE(D_BOOL8, D_I16|Q_DFP) IO_TYPE(D_BOOL8, D_I16|Q_ASYM) IO_TYPE(D_BOOL8, D_I16|Q_SYM) + IO_TYPE(D_BOOL8, D_F16) IO_TYPE(D_BOOL8, D_I32) IO_TYPE(D_BOOL8, D_U16) IO_TYPE(D_BOOL8, D_U32) @@ -258,6 +259,7 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_BOOL8) IO_TYPE(D_I16|Q_ASYM, D_BOOL8) IO_TYPE(D_I16|Q_SYM, D_BOOL8) + IO_TYPE(D_F16, D_BOOL8) IO_TYPE(D_I32, D_BOOL8) IO_TYPE(D_U16, D_BOOL8) IO_TYPE(D_U32, D_BOOL8) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c index be301ea..ce79a92 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c @@ -25,6 +25,7 @@ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_graph.h" #include "vsi_nn_node.h" @@ -410,8 +411,11 @@ static vsi_bool op_setup * */ if( VSI_NN_DIM_FMT_NHWC == inputs[1]->attr.dtype.fmt ) { - vsi_nn_TransposeTensor( self->graph, inputs[1], perm, 4, NULL ); - inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW; + if (!((vsi_nn_tensor_prv_t*)inputs[1])->processed) + { + vsi_nn_TransposeTensor(self->graph, inputs[1], perm, 4, NULL); + inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW; + } } #ifdef VX_CONVERT_POLICY_WRAP_ENABLE @@ -424,22 +428,30 @@ static vsi_bool op_setup #ifdef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 && TRUE == inputs[1]->attr.is_const) { - /* whnc->whcn */ - vsi_nn_PermuteTensor( self->graph, inputs[1], perm1, 4 ); + if (!((vsi_nn_tensor_prv_t*)inputs[1])->processed) { + /* whnc->whcn */ + vsi_nn_PermuteTensor(self->graph, inputs[1], perm1, 4); + } } /* Rotate 180 degrees for weights data */ if (TRUE == inputs[1]->attr.is_const) { - vsi_nn_reshuffle_weight_data(self->graph, inputs[1]); + if (!((vsi_nn_tensor_prv_t*)inputs[1])->processed) { + vsi_nn_reshuffle_weight_data(self->graph, inputs[1]); + } } #else if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) >= 0 && TRUE == inputs[1]->attr.is_const) { /* whcn->whnc */ - vsi_nn_PermuteTensor( self->graph, inputs[1], perm1, 4 ); + if 
(!((vsi_nn_tensor_prv_t*)inputs[1])->processed) { + vsi_nn_PermuteTensor(self->graph, inputs[1], perm1, 4); + } } #endif + ((vsi_nn_tensor_prv_t*)inputs[1])->processed = TRUE; + nn_param = &self->nn_param.deconv; nn_param->group = ( 0 == nn_param->group ) ? 1 : nn_param->group; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index 489d3cb..be32f48 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -50,36 +50,12 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; - uint32_t i = 0; - vsi_size_t block_size = 1, block_num = 1, axis_num = 0, indices_num = 1; int32_t axis = self->nn_param.gather.axis; int32_t batch_dims = self->nn_param.gather.batch_dims; - vsi_size_t *input_size = inputs[0]->attr.size; - uint32_t r_rank = vsi_nn_GetTensorIsScalar(inputs[0]) ? 0 : inputs[0]->attr.dim_num; - uint32_t q_rank = vsi_nn_GetTensorIsScalar(inputs[1]) ? 0 : inputs[1]->attr.dim_num; param = vsi_nn_kernel_param_create(); - for (i = 0; i < (uint32_t)axis; ++i) - { - block_size *= input_size[i]; - } - - axis_num = input_size[axis]; - for (i = axis + 1; i < r_rank - batch_dims; ++i) - { - block_num *= input_size[i]; - } - for (i = 0; i < q_rank - batch_dims; ++i) - { - indices_num *= inputs[1]->attr.size[i]; - } - - vsi_nn_kernel_param_add_int32( param, "block_size", (int32_t)block_size ); - vsi_nn_kernel_param_add_int32( param, "block_num", (int32_t)block_num ); - vsi_nn_kernel_param_add_int32( param, "axis_num", (int32_t)axis_num ); vsi_nn_kernel_param_add_int32( param, "axis", (int32_t)axis ); - vsi_nn_kernel_param_add_int32( param, "indices_num", (int32_t)indices_num ); vsi_nn_kernel_param_add_int32( param, "batch_dims", (int32_t)batch_dims ); if (vsi_nn_is_same_data_type(inputs[0], outputs[0]) == FALSE || diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c index 2fc49d0..da15699 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c @@ -234,6 +234,10 @@ static vsi_bool op_setup_default { attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; } + else if (inputs[GRUCELL_IN_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16) + { + attr.dtype.vx_type = VSI_NN_TYPE_BFLOAT16; + } else { attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c index f715c99..4bf4443 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c @@ -374,6 +374,17 @@ static vsi_bool op_setup } } + for ( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++) + { + if (inputs[LSTMUNIT_INPUT_WEIGHT_I2I + i] != NULL + && p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i].qnt_type == VSI_NN_QNT_TYPE_NONE + && p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i].vx_type == VSI_NN_TYPE_NONE + && inputs[LSTMUNIT_INPUT_WEIGHT_I2I + i]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16) + { + p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i] = inputs[LSTMUNIT_INPUT_WEIGHT_I2I + i]->attr.dtype; + } + } + /* Input FC */ if( is_input_fc_on_tp ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c index f4005a8..760c88a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c @@ -54,21 +54,12 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t *param = NULL; vsi_nn_kernel_node_t n = NULL; - vsi_nn_tensor_t * tmp_inputs[2] = {NULL}; - vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; - uint32_t new_rank[3] = {0}; - vsi_bool ret = FALSE; - vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; int32_t transposeA = self->nn_param.matrixmul.transpose[0]; int32_t transposeB = self->nn_param.matrixmul.transpose[1]; int32_t adjointA = self->nn_param.matrixmul.adjoint[0]; int32_t adjointB = self->nn_param.matrixmul.adjoint[1]; - uint32_t cross_flg = 0; - uint32_t size_axis_inner_outer[3] = {0}; - uint32_t stride_axis_inner_outer[9] = {0}; - param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_int32( param, "transposeA", transposeA ); @@ -76,52 +67,18 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "adjointA", adjointA ); vsi_nn_kernel_param_add_int32( param, "adjointB", adjointB ); - - ret = vsi_nn_kernel_optimize_matrixmul_broadcast_shape( - inputs[0]->attr.size, - inputs[1]->attr.size, - outputs[0]->attr.size, - inputs[0]->attr.dim_num, - inputs[1]->attr.dim_num, - outputs[0]->attr.dim_num, - shapes[0], shapes[1], shapes[2], new_rank, - &cross_flg, size_axis_inner_outer, stride_axis_inner_outer); - - if (ret) - { - vsi_nn_kernel_param_add_int32( param, "cross_flg", cross_flg ); - vsi_nn_kernel_param_add_buffer( param, "size_axis_inner_outer", size_axis_inner_outer, 3); - vsi_nn_kernel_param_add_buffer( param, "stride_axis_inner_outer", stride_axis_inner_outer, 9); - - tmp_inputs[0] = vsi_nn_reshape_tensor(self->graph, inputs[0], shapes[0], new_rank[0]); - tmp_inputs[1] = vsi_nn_reshape_tensor(self->graph, inputs[1], shapes[1], new_rank[1]); - tmp_outputs[0] = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes[2], new_rank[2]); - } - else - { - VSILOGE("illegal inputs shape"); - status = VSI_FAILURE; - goto final; - } - - - n = vsi_nn_kernel_selector( self->graph, "matrixmul", tmp_inputs, 2, tmp_outputs, 1, param ); + n = vsi_nn_kernel_selector( self->graph, "matrixmul", inputs, 2, outputs, 1, param ); if ( n != NULL ) { self->n = (vx_node)n; status = VSI_SUCCESS; } -final: if (param != NULL) { vsi_nn_kernel_param_release( ¶m ); } - vsi_safe_release_tensor( tmp_inputs[0] ); - vsi_safe_release_tensor( tmp_inputs[1] ); - vsi_safe_release_tensor( tmp_outputs[0] ); - return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c index a94df55..b7102cf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c @@ -74,6 +74,20 @@ static vsi_bool op_check return ret; } /* op_check() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.max_pool3d.dilation[0] = 1; + self->nn_param.max_pool3d.dilation[1] = 1; + self->nn_param.max_pool3d.dilation[2] = 1; + + return status; +} /* op_init() */ + static vsi_status op_optimize ( vsi_nn_node_t * self, @@ -120,7 +134,7 @@ static vsi_bool op_setup inputs[0]->attr.size, ksize, p->stride, - NULL, + p->dilation, p->pad_type, pad ); @@ -142,7 +156,7 @@ static vsi_bool op_setup p->ksize[0], &p->pad[0], p->stride[0], - 0, + p->dilation[0], p->round_type ); @@ -152,7 +166,7 @@ static vsi_bool op_setup p->ksize[1], &p->pad[2], p->stride[1], - 0, + p->dilation[1], p->round_type ); @@ -162,7 +176,7 @@ static vsi_bool op_setup 
p->ksize[2], &p->pad[4], p->stride[2], - 0, + p->dilation[2], p->round_type ); @@ -210,6 +224,8 @@ static vsi_bool op_setup curr->node->nn_param.pool.pad[1] = p->pad[1]; curr->node->nn_param.pool.pad[2] = p->pad[2]; curr->node->nn_param.pool.pad[3] = p->pad[3]; + curr->node->nn_param.pool.dilation[0] = p->dilation[0]; + curr->node->nn_param.pool.dilation[1] = p->dilation[1]; curr->node->nn_param.pool.type = VX_CONVOLUTIONAL_NETWORK_POOLING_MAX; curr->node->nn_param.pool.round_type = p->round_type; curr->node->nn_param.pool.pad_type = p->pad_type; @@ -265,6 +281,8 @@ static vsi_bool op_setup curr->node->nn_param.pool.pad[1] = 0; curr->node->nn_param.pool.pad[2] = p->pad[4]; curr->node->nn_param.pool.pad[3] = p->pad[5]; + curr->node->nn_param.pool.dilation[0] = 1; + curr->node->nn_param.pool.dilation[1] = p->dilation[2]; curr->node->nn_param.pool.type = VX_CONVOLUTIONAL_NETWORK_POOLING_MAX; curr->node->nn_param.pool.round_type = p->round_type; curr->node->nn_param.pool.pad_type = p->pad_type; @@ -305,7 +323,7 @@ __BEGIN_DECLS DEF_OP_REG ( /* op_name */ MAX_POOL3D, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c index a70c3f7..4b5b01a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c @@ -37,29 +37,25 @@ #define _INPUT_NUM 1 #define _OUTPUT_NUM 2 -static void _squeeze_axis +static vsi_bool _is_continue_axis ( - vsi_nn_tensor_t *input, const int32_t* axis_in, - int32_t axis_num, - int32_t* axis_out, - int32_t *axis_num_out + int32_t axis_num ) { int32_t i = 0; + vsi_bool is_continue_axis = TRUE; - memcpy(axis_out, axis_in, sizeof(int32_t) * axis_num); - *axis_num_out = axis_num; - - for (i = 0; i < axis_num; i++) + for ( i = 1; i < axis_num; i++) { - if (axis_in[i] == 3 && input->attr.size[3] == 1) + if ( axis_in[i] != (axis_in[i - 1] + 1) && axis_in[0] == 0) { - *axis_num_out = axis_num - 1; - axis_out[i] = 0; + is_continue_axis = FALSE; break; } } + + return is_continue_axis; } static vsi_status op_compute @@ -72,16 +68,18 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; - int32_t axes_copy[VSI_NN_MAX_DIM_NUM] = { 0 }; - const int32_t* axis = self->nn_param.moments.axis; + int32_t* axis = (int32_t* )self->nn_param.moments.axis; int32_t axis_num = self->nn_param.moments.axis_num; int32_t keep_dim = self->nn_param.moments.keep_dim ? 
1 : 0; - _squeeze_axis(inputs[0], axis, axis_num, axes_copy, &axis_num); + if (self->nn_param.moments.lcl_data->use_internal_node) + { + return vsi_nn_internal_compute_node( self ); + } param = vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_buffer( param, "axis", axes_copy, axis_num); + vsi_nn_kernel_param_add_buffer( param, "axis", axis, axis_num); vsi_nn_kernel_param_add_int32( param, "keep_dim", keep_dim); n = vsi_nn_kernel_selector( self->graph, "moments", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); if (n != NULL) @@ -105,10 +103,6 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - int32_t axes_copy[VSI_NN_MAX_DIM_NUM] = { 0 }; - int32_t axes_num = 0; - int32_t i = 0; - BEGIN_IO_TYPE_DECL(MOMENTS, 1, 2) IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32) @@ -140,18 +134,6 @@ static vsi_bool op_check return FALSE; } - _squeeze_axis(inputs[0], self->nn_param.moments.axis, - self->nn_param.moments.axis_num, axes_copy, &axes_num); - - for (i = 0; i < axes_num; i++) - { - if (axes_copy[i] > 2) - { - VSILOGE("moments shader path not support axis: %d", axes_copy[i]); - return FALSE; - } - } - return TRUE; } /* op_check() */ @@ -165,12 +147,16 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. */ int32_t i = 0, j = 0; vsi_nn_moments_param * p = NULL; + vsi_bool is_continue_axis = FALSE; + vsi_bool ret = TRUE; + + p = &(self->nn_param.moments); if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) { const int32_t* axis = NULL; int32_t axis_num = 0; - p = &(self->nn_param.moments); + axis = p->axis; axis_num = p->axis_num; @@ -216,7 +202,135 @@ static vsi_bool op_setup } } } - return TRUE; + + is_continue_axis = _is_continue_axis(p->axis, p->axis_num); + + if (is_continue_axis == FALSE) + { + vsi_nn_tensor_attr_t attr; + int32_t index = p->axis_num; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* trans_tensor = NULL; + vsi_nn_internal_tensor_t* output0_tensor = NULL; + vsi_nn_internal_tensor_t* output1_tensor = NULL; + + self->nn_param.moments.lcl_data->use_internal_node = TRUE; + + vsi_nn_internal_init_node_wksp( self ); + + memcpy( &attr, &inputs[0]->attr, sizeof( attr ) ); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + trans_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + if (trans_tensor == NULL) + { + VSILOGD("CHECK POINTER Create internal tensor failed"); + ret = FALSE; + goto final; + } + + memcpy( &attr, &outputs[0]->attr, sizeof( attr ) ); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + output0_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + if (output0_tensor == NULL) + { + VSILOGD("CHECK POINTER Create internal tensor failed"); + ret = FALSE; + goto final; + } + + memcpy( &attr, &outputs[1]->attr, sizeof( attr ) ); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + output1_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + if (output1_tensor == NULL) + { + VSILOGD("CHECK POINTER Create internal tensor failed"); + ret = FALSE; + goto final; + } + + memcpy(p->lcl_data->perm, p->axis, p->axis_num * sizeof(p->axis[0])); + + for ( i = 0; i < (int32_t)inputs[0]->attr.dim_num; i++ ) + { + p->lcl_data->axis[i] = i; + + for ( j = 0; j < p->axis_num; j++ ) + { + if (i == p->axis[j]) + { + break; + } + } + + if (j == p->axis_num) + { + p->lcl_data->perm[index ++] = i; + } + } + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + if (curr == NULL) + { + VSILOGD("CHECK POINTER 
Create internal node failed"); + ret = FALSE; + goto final; + } + curr->node->nn_param.permute.perm = p->lcl_data->perm; + curr->node->nn_param.permute.dim_num = inputs[0]->attr.dim_num; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = trans_tensor->t; + vsi_nn_internal_setup_node(self, curr); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_MOMENTS, 0, 0 ); + if (curr == NULL) + { + VSILOGD("CHECK POINTER Create internal node failed"); + ret = FALSE; + goto final; + } + curr->node->nn_param.moments.axis = p->lcl_data->axis; + curr->node->nn_param.moments.axis_num = p->axis_num; + curr->node->nn_param.moments.keep_dim = p->keep_dim; + curr->inputs[0] = trans_tensor->t; + curr->outputs[0] = output0_tensor->t; + curr->outputs[1] = output1_tensor->t; + vsi_nn_internal_setup_node(self, curr); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + if (curr == NULL) + { + VSILOGD("CHECK POINTER Create internal node failed"); + ret = FALSE; + goto final; + } + curr->node->nn_param.reshape2.size = outputs[0]->attr.size; + curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; + curr->inputs[0] = output0_tensor->t; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + if (curr == NULL) + { + VSILOGD("CHECK POINTER Create internal node failed"); + ret = FALSE; + goto final; + } + curr->node->nn_param.reshape2.size = outputs[1]->attr.size; + curr->node->nn_param.reshape2.dim_num = outputs[1]->attr.dim_num; + curr->inputs[0] = output1_tensor->t; + curr->outputs[0] = outputs[1]; + vsi_nn_internal_setup_node(self, curr); + } + +final: + return ret; } /* op_setup() */ static vsi_status op_deinit @@ -224,11 +338,60 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { + vsi_nn_moments_param * p = NULL; + + p = &(self->nn_param.moments); + + vsi_nn_safe_free(p->lcl_data); + + vsi_nn_internal_deinit_node_wksp( self ); + vsi_nn_op_common_deinit(self); return VSI_SUCCESS; } /* op_deinit() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_moments_param * p = NULL; + + p = &(self->nn_param.moments); + + p->lcl_data = + (vsi_nn_moments_lcl_data *)malloc(sizeof(vsi_nn_moments_lcl_data)); + if (NULL == p->lcl_data) + { + return VX_ERROR_NO_MEMORY; + } + memset(p->lcl_data, 0, sizeof(vsi_nn_moments_lcl_data)); + + return status; +} + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + if (self->nn_param.moments.lcl_data->use_internal_node == FALSE) + { + return VSI_SUCCESS; + } + else + { + return vsi_nn_internal_optimize_node( self, direction ); + } +} + #ifdef __cplusplus extern "C" { #endif @@ -236,12 +399,12 @@ extern "C" { DEF_OP_REG ( /* op_name */ MOMENTS, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, /* setup */ op_setup, - /* optimize */ NULL, + /* optimize */ op_optimize, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c b/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c index 39dda24..04b502d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c @@ -59,61 +59,50 @@ static void _set_io_index vsi_nn_tensor_t** outputs ) { - uint32_t idx, i, j; - - idx = 0; - for (i = 0; i < 
self->input.num; i++) + uint32_t i; + uint32_t input_idx = 0; + uint32_t output_idx = 0; + uint32_t numParams; + vxQueryNode(self->n, VX_NODE_PARAMETERS, &numParams, sizeof(numParams)); + for ( i = 0; in, idx++, (vx_reference)inputs[i]->t); - scalar_index = idx; - param = vxGetParameterByIndex(self->n, scalar_index); - - if (param) + param = vxGetParameterByIndex(self->n, i); + if(param) { + vx_enum type = 0; + vx_enum direction = 0; vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + vxQueryParameter(param, VX_PARAMETER_DIRECTION, &direction, sizeof(vx_enum)); + if (type == VX_TYPE_TENSOR && direction == VX_INPUT) + { + vxSetParameterByIndex(self->n, i, (vx_reference)inputs[input_idx++]->t); + } + else if(type == VX_TYPE_SCALAR && direction == VX_INPUT) + { + vx_reference ref = 0; + vsi_status status; + vx_enum data_type = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &ref, sizeof(vx_reference)); + status = vxQueryScalar((vx_scalar)ref, + VX_SCALAR_TYPE, + &data_type, + sizeof(vx_enum)); + if (status == VX_ERROR_INVALID_REFERENCE) + { + vx_scalar scalar = vxCreateScalar(self->graph->ctx->c, VX_TYPE_INT32, 0); + ref = (vx_reference)scalar; + vxSetParameterByIndex(self->n, i, ref); + vxReleaseReference(&ref); + } + } + else //output + { + vxSetParameterByIndex(self->n, i, (vx_reference)outputs[output_idx++]->t); + } vxReleaseParameter(¶m); param = NULL; } - - if (type != VX_TYPE_SCALAR) - { - continue; - } - else - { - - /* 4 crop scalar parameters input */ - for (j = scalar_index; j < scalar_index + 4; j++) - { - vx_enum data_type = 0; - vx_reference ref = 0; - vsi_status status; - param = vxGetParameterByIndex(self->n, j); - - if (param) - { - vxQueryParameter(param, VX_PARAMETER_REF, &ref, sizeof(vx_reference)); - status = vxQueryScalar((vx_scalar)ref, VX_SCALAR_TYPE, &data_type, sizeof(vx_enum)); - if (status == VX_ERROR_INVALID_REFERENCE) - { - vx_scalar scalar = vxCreateScalar(self->graph->ctx->c, VX_TYPE_INT32, 0); - ref = (vx_reference)scalar; - vxSetParameterByIndex(self->n, idx++, ref); - vxReleaseReference(&ref); - } - vxReleaseParameter(¶m); - param = NULL; - } - } - } - } - for (i = 0; i < self->output.num; i++) - { - vxSetParameterByIndex(self->n, idx++, (vx_reference)outputs[i]->t); } } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c index 38409d6..8b98742 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c @@ -34,6 +34,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" static vsi_bool _is_pool1d @@ -112,13 +113,42 @@ static vsi_status op_compute params.stride_y = self->nn_param.pool.stride[1]; } - self->n = vxPoolingLayer2( - self->graph->g, - tmp_inputs[0]->t, - (vx_nn_pooling_params_t *)¶ms, - sizeof( params ), - tmp_outputs[0]->t - ); + if (self->nn_param.pool.type == VX_NN_POOLING_MAX + && (self->nn_param.pool.dilation[0] > 1 || self->nn_param.pool.dilation[1] > 1)) + { + vsi_nn_kernel_param_t * tmpParam = NULL; + int32_t pool_type = 0; + tmpParam = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( tmpParam, "pool_type", pool_type ); + vsi_nn_kernel_param_add_int32( tmpParam, "pool_size_x", params.base.pool_size_x ); + vsi_nn_kernel_param_add_int32( tmpParam, "pool_size_y", params.base.pool_size_y ); + vsi_nn_kernel_param_add_int32( tmpParam, "pool_pad_x_left", params.base.pool_pad_x_left ); + 
vsi_nn_kernel_param_add_int32( tmpParam, "pool_pad_x_right", params.base.pool_pad_x_right ); + vsi_nn_kernel_param_add_int32( tmpParam, "pool_pad_y_top", params.base.pool_pad_y_top ); + vsi_nn_kernel_param_add_int32( tmpParam, "pool_pad_y_bottom", params.base.pool_pad_y_bottom ); + vsi_nn_kernel_param_add_int32( tmpParam, "stride_x", params.stride_x ); + vsi_nn_kernel_param_add_int32( tmpParam, "stride_y", params.stride_y ); + vsi_nn_kernel_param_add_int32( tmpParam, "dilation_x", self->nn_param.pool.dilation[0] ); + vsi_nn_kernel_param_add_int32( tmpParam, "dilation_y", self->nn_param.pool.dilation[1] ); + + self->n = vsi_nn_kernel_selector( self->graph, "pool", tmp_inputs, 1, tmp_outputs, 1, tmpParam ); + + if (tmpParam != NULL) + { + vsi_nn_kernel_param_release( &tmpParam ); + } + } + else + { + self->n = vxPoolingLayer2( + self->graph->g, + tmp_inputs[0]->t, + (vx_nn_pooling_params_t *)¶ms, + sizeof( params ), + tmp_outputs[0]->t + ); + } if ( NULL != self->n ) { @@ -269,6 +299,9 @@ static vsi_status op_init self->nn_param.pool.local->reshaped_input = NULL; self->nn_param.pool.local->reshaped_output = NULL; + self->nn_param.pool.dilation[0] = 1; + self->nn_param.pool.dilation[1] = 1; + return status; } /* op_init() */ @@ -315,7 +348,7 @@ static vsi_bool op_setup inputs[0]->attr.size, ksize, self->nn_param.pool.stride, - NULL, + self->nn_param.pool.dilation, self->nn_param.pool.pad_type, pad ); @@ -335,7 +368,7 @@ static vsi_bool op_setup self->nn_param.pool.ksize[0], &self->nn_param.pool.pad[0], self->nn_param.pool.stride[0], - 0, + self->nn_param.pool.dilation[0], self->nn_param.pool.round_type ); @@ -348,7 +381,7 @@ static vsi_bool op_setup inputs[0]->attr.size, ksize, self->nn_param.pool.stride, - NULL, + self->nn_param.pool.dilation, self->nn_param.pool.pad_type, pad ); @@ -368,7 +401,7 @@ static vsi_bool op_setup self->nn_param.pool.ksize[0], &self->nn_param.pool.pad[0], self->nn_param.pool.stride[0], - 0, + self->nn_param.pool.dilation[0], self->nn_param.pool.round_type ); @@ -378,7 +411,7 @@ static vsi_bool op_setup self->nn_param.pool.ksize[1], &self->nn_param.pool.pad[2], self->nn_param.pool.stride[1], - 0, + self->nn_param.pool.dilation[1], self->nn_param.pool.round_type ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index f977e32..dea1770 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -115,6 +115,13 @@ static vsi_bool op_setup vsi_nn_internal_init_node_wksp( self ); + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + p->norm2.scale[0] = p->norm.scale; + p->norm2.scale[1] = p->norm.scale; + p->norm2.scale[2] = p->norm.scale; + } + if ( p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444 || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || @@ -196,7 +203,7 @@ static vsi_bool op_setup CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.pre_process_gray.mean = p->norm.mean[0]; - curr->node->nn_param.pre_process_gray.scale = p->norm.scale; + curr->node->nn_param.pre_process_gray.scale = p->norm2.scale[0]; curr->node->nn_param.pre_process_gray.rect.left = p->rect.left; curr->node->nn_param.pre_process_gray.rect.top = p->rect.top; curr->node->nn_param.pre_process_gray.rect.width = p->rect.width; @@ -274,36 +281,18 @@ static vsi_bool op_setup curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[2]; 
curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[0]; - if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) - { - curr->node->nn_param.pre_process_yuv420.r_scale = p->norm.scale; - curr->node->nn_param.pre_process_yuv420.g_scale = p->norm.scale; - curr->node->nn_param.pre_process_yuv420.b_scale = p->norm.scale; - } - else - { - curr->node->nn_param.pre_process_yuv420.r_scale = p->norm2.scale[2]; - curr->node->nn_param.pre_process_yuv420.g_scale = p->norm2.scale[1]; - curr->node->nn_param.pre_process_yuv420.b_scale = p->norm2.scale[0]; - } + curr->node->nn_param.pre_process_yuv420.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_yuv420.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv420.b_scale = p->norm2.scale[0]; } else { curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[2]; - if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) - { - curr->node->nn_param.pre_process_yuv420.r_scale = p->norm.scale; - curr->node->nn_param.pre_process_yuv420.g_scale = p->norm.scale; - curr->node->nn_param.pre_process_yuv420.b_scale = p->norm.scale; - } - else - { - curr->node->nn_param.pre_process_yuv420.r_scale = p->norm2.scale[0]; - curr->node->nn_param.pre_process_yuv420.g_scale = p->norm2.scale[1]; - curr->node->nn_param.pre_process_yuv420.b_scale = p->norm2.scale[2]; - } + curr->node->nn_param.pre_process_yuv420.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_yuv420.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv420.b_scale = p->norm2.scale[2]; } curr->node->nn_param.pre_process_yuv420.reverse_channel = p->reverse_channel; @@ -341,36 +330,18 @@ static vsi_bool op_setup curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[2]; curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[0]; - if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) - { - curr->node->nn_param.pre_process_bgra.r_scale = p->norm.scale; - curr->node->nn_param.pre_process_bgra.g_scale = p->norm.scale; - curr->node->nn_param.pre_process_bgra.b_scale = p->norm.scale; - } - else - { - curr->node->nn_param.pre_process_bgra.r_scale = p->norm2.scale[2]; - curr->node->nn_param.pre_process_bgra.g_scale = p->norm2.scale[1]; - curr->node->nn_param.pre_process_bgra.b_scale = p->norm2.scale[0]; - } + curr->node->nn_param.pre_process_bgra.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_bgra.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_bgra.b_scale = p->norm2.scale[0]; } else { curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[2]; - if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) - { - curr->node->nn_param.pre_process_bgra.r_scale = p->norm.scale; - curr->node->nn_param.pre_process_bgra.g_scale = p->norm.scale; - curr->node->nn_param.pre_process_bgra.b_scale = p->norm.scale; - } - else - { - curr->node->nn_param.pre_process_bgra.r_scale = p->norm2.scale[0]; - curr->node->nn_param.pre_process_bgra.g_scale = p->norm2.scale[1]; - curr->node->nn_param.pre_process_bgra.b_scale = p->norm2.scale[2]; - } + curr->node->nn_param.pre_process_bgra.r_scale = p->norm2.scale[0]; + 
curr->node->nn_param.pre_process_bgra.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_bgra.b_scale = p->norm2.scale[2]; } curr->node->nn_param.pre_process_bgra.reverse_channel = p->reverse_channel; @@ -440,33 +411,15 @@ static vsi_bool op_setup if (p->reverse_channel) { - if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) - { - curr->node->nn_param.pre_process_rgb888_planar.r_scale = p->norm.scale; - curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm.scale; - curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm.scale; - } - else - { - curr->node->nn_param.pre_process_rgb888_planar.r_scale = p->norm2.scale[2]; - curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm2.scale[1]; - curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm2.scale[0]; - } + curr->node->nn_param.pre_process_rgb888_planar.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm2.scale[0]; } else { - if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) - { - curr->node->nn_param.pre_process_rgb888_planar.r_scale = p->norm.scale; - curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm.scale; - curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm.scale; - } - else - { - curr->node->nn_param.pre_process_rgb888_planar.r_scale = p->norm2.scale[0]; - curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm2.scale[1]; - curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm2.scale[2]; - } + curr->node->nn_param.pre_process_rgb888_planar.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm2.scale[2]; } curr->node->nn_param.pre_process_rgb888_planar.r_mean = mean[0]; @@ -493,36 +446,18 @@ static vsi_bool op_setup curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[2]; curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[0]; - if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) - { - curr->node->nn_param.pre_process_yuv444.r_scale = p->norm.scale; - curr->node->nn_param.pre_process_yuv444.g_scale = p->norm.scale; - curr->node->nn_param.pre_process_yuv444.b_scale = p->norm.scale; - } - else - { - curr->node->nn_param.pre_process_yuv444.r_scale = p->norm2.scale[2]; - curr->node->nn_param.pre_process_yuv444.g_scale = p->norm2.scale[1]; - curr->node->nn_param.pre_process_yuv444.b_scale = p->norm2.scale[0]; - } + curr->node->nn_param.pre_process_yuv444.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_yuv444.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv444.b_scale = p->norm2.scale[0]; } else { curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[2]; - if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) - { - curr->node->nn_param.pre_process_yuv444.r_scale = p->norm.scale; - curr->node->nn_param.pre_process_yuv444.g_scale = p->norm.scale; - curr->node->nn_param.pre_process_yuv444.b_scale = p->norm.scale; - } - else - { - curr->node->nn_param.pre_process_yuv444.r_scale = p->norm2.scale[0]; - curr->node->nn_param.pre_process_yuv444.g_scale = p->norm2.scale[1]; - curr->node->nn_param.pre_process_yuv444.b_scale = 
p->norm2.scale[2]; - } + curr->node->nn_param.pre_process_yuv444.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_yuv444.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv444.b_scale = p->norm2.scale[2]; } curr->node->nn_param.pre_process_yuv444.reverse_channel = p->reverse_channel; @@ -561,36 +496,18 @@ static vsi_bool op_setup curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[2]; curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[0]; - if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) - { - curr->node->nn_param.pre_process_nv12.r_scale = p->norm.scale; - curr->node->nn_param.pre_process_nv12.g_scale = p->norm.scale; - curr->node->nn_param.pre_process_nv12.b_scale = p->norm.scale; - } - else - { - curr->node->nn_param.pre_process_nv12.r_scale = p->norm2.scale[2]; - curr->node->nn_param.pre_process_nv12.g_scale = p->norm2.scale[1]; - curr->node->nn_param.pre_process_nv12.b_scale = p->norm2.scale[0]; - } + curr->node->nn_param.pre_process_nv12.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_nv12.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_nv12.b_scale = p->norm2.scale[0]; } else { curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[2]; - if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) - { - curr->node->nn_param.pre_process_nv12.r_scale = p->norm.scale; - curr->node->nn_param.pre_process_nv12.g_scale = p->norm.scale; - curr->node->nn_param.pre_process_nv12.b_scale = p->norm.scale; - } - else - { - curr->node->nn_param.pre_process_nv12.r_scale = p->norm2.scale[0]; - curr->node->nn_param.pre_process_nv12.g_scale = p->norm2.scale[1]; - curr->node->nn_param.pre_process_nv12.b_scale = p->norm2.scale[2]; - } + curr->node->nn_param.pre_process_nv12.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_nv12.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_nv12.b_scale = p->norm2.scale[2]; } if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12) @@ -637,36 +554,18 @@ static vsi_bool op_setup curr->node->nn_param.pre_process_yuv422.r_mean = p->norm.mean[2]; curr->node->nn_param.pre_process_yuv422.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv422.b_mean = p->norm.mean[0]; - if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) - { - curr->node->nn_param.pre_process_yuv422.r_scale = p->norm.scale; - curr->node->nn_param.pre_process_yuv422.g_scale = p->norm.scale; - curr->node->nn_param.pre_process_yuv422.b_scale = p->norm.scale; - } - else - { - curr->node->nn_param.pre_process_yuv422.r_scale = p->norm2.scale[2]; - curr->node->nn_param.pre_process_yuv422.g_scale = p->norm2.scale[1]; - curr->node->nn_param.pre_process_yuv422.b_scale = p->norm2.scale[0]; - } + curr->node->nn_param.pre_process_yuv422.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_yuv422.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv422.b_scale = p->norm2.scale[0]; } else { curr->node->nn_param.pre_process_yuv422.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_yuv422.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv422.b_mean = p->norm.mean[2]; - if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) - { - curr->node->nn_param.pre_process_yuv422.r_scale = p->norm.scale; - curr->node->nn_param.pre_process_yuv422.g_scale = p->norm.scale; - 
curr->node->nn_param.pre_process_yuv422.b_scale = p->norm.scale; - } - else - { - curr->node->nn_param.pre_process_yuv422.r_scale = p->norm2.scale[0]; - curr->node->nn_param.pre_process_yuv422.g_scale = p->norm2.scale[1]; - curr->node->nn_param.pre_process_yuv422.b_scale = p->norm2.scale[2]; - } + curr->node->nn_param.pre_process_yuv422.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_yuv422.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv422.b_scale = p->norm2.scale[2]; } if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422) @@ -781,6 +680,7 @@ static vsi_status op_init { return VX_ERROR_NO_MEMORY; } + memset(self->nn_param.pre_process.local, 0, sizeof(vsi_nn_pre_process_lcl_data)); return status; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index dcbb75b..aa2b231 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -37,6 +37,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #include "vsi_nn_error.h" +#include "vsi_nn_tensor_util_prv.h" #define _ARG_NUM (6) #define _INPUT_NUM (1) @@ -175,6 +176,11 @@ static vsi_bool _check_is_sp_supported_type return FALSE; } + if (vsi_nn_is_stream_process_supported_types(self->graph, &input, 1) == FALSE) + { + return FALSE; + } + if ( (VSI_NN_TYPE_FLOAT64 == input->attr.dtype.vx_type) || (VSI_NN_TYPE_UINT32 == input->attr.dtype.vx_type) || (VSI_NN_TYPE_UINT64 == input->attr.dtype.vx_type) || @@ -850,7 +856,10 @@ static vsi_bool op_set_sp_reduce_internal vsi_nn_internal_init_node_wksp( self ); memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor); + memcpy(&attr.dtype, &inputs[0]->attr.dtype, sizeof(vsi_nn_dtype_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reducel2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reducel2.c new file mode 100644 index 0000000..3b92359 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reducel2.c @@ -0,0 +1,183 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_error.h" + +typedef struct _reducel2_local_data_t { + int32_t placeholder; +} reducel2_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + return vsi_nn_internal_compute_node( self ); + +} /* op_compute() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + return vsi_nn_internal_optimize_node( self, direction ); +} + + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* square_tensor = NULL; + vsi_nn_internal_tensor_t* reducesum_tensor = NULL; + vsi_nn_internal_node_t* square_node = NULL; + vsi_nn_internal_node_t* reducesum_node = NULL; + vsi_nn_internal_node_t* sqrt_node = NULL; + vsi_nn_kernel_dtype_e in_dtype; + + vsi_nn_reducel2_param * p0 = &self->nn_param.reducel2; + + vsi_nn_internal_init_node_wksp(self); + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.is_const = FALSE; + attr.vtl = TRUE; + if (in_dtype == U8 || in_dtype == I8) + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } + else + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + } + + square_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(square_tensor, "Create internal tensor failed", final); + square_node = vsi_nn_internal_new_node( self, VSI_NN_OP_SQUARE, 0, 0); + CHECK_PTR_FAIL_GOTO(square_node, "Create internal node failed", final); + + square_node->inputs[0] = inputs[0]; + square_node->outputs[0] = square_tensor->t; + vsi_nn_internal_setup_node( self, square_node ); + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + + reducesum_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(square_tensor, "Create internal tensor failed", final); + reducesum_node = vsi_nn_internal_new_node( self, VSI_NN_OP_REDUCE, 0, 0); + CHECK_PTR_FAIL_GOTO(reducesum_node, "Create internal node failed", final); + + reducesum_node->node->nn_param.reduce.type = VSI_NN_REDUCE_SUM; + reducesum_node->node->nn_param.reduce.axis = p0->axis; + reducesum_node->node->nn_param.reduce.axis_num = p0->axis_num; + reducesum_node->node->nn_param.reduce.keep_dim = p0->keep_dim; + + reducesum_node->inputs[0] = square_tensor->t; + reducesum_node->outputs[0] = reducesum_tensor->t; + vsi_nn_internal_setup_node( self, reducesum_node 
); + + sqrt_node = vsi_nn_internal_new_node( self, VSI_NN_OP_SQRT, 0, 0); + CHECK_PTR_FAIL_GOTO(sqrt_node, "Create internal node failed", final); + + sqrt_node->inputs[0] = reducesum_tensor->t; + sqrt_node->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node( self, sqrt_node ); + + /* TODO: Add code to comput outputs' shape. */ + return TRUE; +final: + return FALSE; +} /* op_setup() */ + + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ REDUCEL2, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c index 9efd8fc..305deb8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c @@ -109,6 +109,8 @@ static vsi_bool op_check IO_TYPE(D_BOOL8, D_BOOL8, D_BOOL8) IO_TYPE(D_F32, D_F32, D_BOOL8) IO_TYPE(D_I32, D_I32, D_BOOL8) + IO_TYPE(D_I16, D_I32, D_BOOL8) + IO_TYPE(D_I32, D_I16, D_BOOL8) IO_TYPE(D_F16, D_F16, D_I8) IO_TYPE(D_F16, D_I16|Q_DFP, D_I8) @@ -136,6 +138,8 @@ static vsi_bool op_check IO_TYPE(D_BOOL8, D_BOOL8, D_I8) IO_TYPE(D_F32, D_F32, D_I8) IO_TYPE(D_I32, D_I32, D_I8) + IO_TYPE(D_I16, D_I32, D_I8) + IO_TYPE(D_I32, D_I16, D_I8) END_IO_TYPE_DECL(RELATIONAL_OPS) if (!VALIDATE_OP_IO_TYPES(RELATIONAL_OPS, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c index 4c36426..1baa420 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c @@ -118,6 +118,16 @@ static vsi_bool op_check IO_TYPE(D_F16, D_BF16) IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F32, D_I16|Q_DFP) + IO_TYPE(D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F32, D_I16|Q_SYM) + IO_TYPE(D_F32, D_I8|Q_DFP) + IO_TYPE(D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F32, D_I8|Q_SYM) + IO_TYPE(D_F32, D_I32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_I32, D_F32) + /* HW 9.1.1 */ IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index ae43c05..d6e6e90 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -315,7 +315,9 @@ static vsi_status op_compute { vsi_size_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; uint32_t dims = inputs[0]->attr.dim_num; - int32_t shrink_axis_mask = params->shrink_axis_mask; + int32_t shrink_axis_mask = params->shrink_axis_mask; + int32_t new_axis_mask = params->new_axis_mask; + int32_t num_add_axis = params->num_add_axis; memset(¶m, 0, sizeof(vx_nn_stride_slice_params_t)); @@ -384,20 +386,26 @@ static vsi_status op_compute memset(&sizes, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); memcpy(&sizes, &outputs[0]->attr.size, sizeof(int32_t) * outputs[0]->attr.dim_num); - if (shrink_axis_mask && p->shrink_axis_mask == 0) + if ((shrink_axis_mask && p->shrink_axis_mask == 0) || + new_axis_mask) { uint32_t i = 0; - uint32_t j = 0; + uint32_t j = 0, idx = 0; - for (i = 0; i < inputs[0]->attr.dim_num; i++) + for (i = 0; i < inputs[0]->attr.dim_num + num_add_axis; i++) { - if (shrink_axis_mask & (1 << i)) + if ( new_axis_mask & (1 << i) ) { - sizes[i] = 1; 
+ j ++; + continue; + } + else if (shrink_axis_mask & (1 << i)) + { + sizes[idx ++] = 1; } else { - sizes[i] = outputs[0]->attr.size[j ++]; + sizes[idx ++] = outputs[0]->attr.size[j ++]; } } } @@ -588,6 +596,7 @@ static vsi_bool _build_strided_slice_params(vsi_nn_strided_slice_param * op_para } } + params->new_axis_mask = new_axis_mask; new_axis_mask = _reverse_mask_bits(new_axis_mask, output_dims); params->num_add_axis = num_add_axis; @@ -789,7 +798,7 @@ static vsi_status op_optimize (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num, sizeof(inputs[0]->attr.size[0]) ); } - else + else if (inputs[0]->attr.dim_num == outputs[0]->attr.dim_num) { if ( NULL == inputs[0]->t ) { @@ -821,6 +830,10 @@ static vsi_status op_optimize outputs[0]->t = in_view_tensor; } } + else + { + self->nn_param.strided_slice.lcl2_data->is_optimized = FALSE; + } OnError: return status; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c index b6fb26e..6291e5c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c @@ -38,7 +38,7 @@ #include "utils/vsi_nn_constraint_check.h" /* - Declare number of input and output. + Declare number of input and output . */ static vsi_bool _is_supported_axis(vsi_size_t* multiples, vsi_size_t multiples_num) @@ -77,14 +77,16 @@ static vsi_status _tile_op_compute vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; vsi_size_t new_rank = 0; vsi_bool ret = FALSE; + uint32_t i = 0; vsi_size_t* multiples = (vsi_size_t*)self->nn_param.tile.multiples; - vsi_nn_tensor_t* temp_tensors[2] = { NULL }; - vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_nn_tensor_t* temp_tensors[3] = { NULL }; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + int32_t multiples_value[VSI_NN_MAX_DIM_NUM] = {0}; vsi_nn_tensor_attr_t attr; if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE) { - VSILOGW("tile is no_range_change operation! \ + VSILOGW(" tile is no_range_change operation! 
\ Insert DataConvert Operation when the quantization parameters\ of input and output are inconsistent!"); @@ -92,17 +94,17 @@ static vsi_status _tile_op_compute memcpy( &attr.dtype, &inputs[0]->attr.dtype, sizeof(attr.dtype)); attr.is_const = FALSE; attr.vtl = TRUE; - temp_tensors[1] = vsi_nn_CreateTensor( self->graph, &attr ); + temp_tensors[2] = vsi_nn_CreateTensor( self->graph, &attr ); } else { - temp_tensors[1] = outputs[0]; + temp_tensors[2] = outputs[0]; } ret = vsi_nn_kernel_optimize_tile_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, multiples, inputs[0]->attr.dim_num, - temp_tensors[1]->attr.size, temp_tensors[1]->attr.dim_num, + temp_tensors[2]->attr.size, temp_tensors[2]->attr.dim_num, shapes[0], shapes[1], shapes[2], &new_rank ); if (ret) @@ -111,9 +113,9 @@ static vsi_status _tile_op_compute { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0],\ shapes[0], (vsi_size_t)new_rank ); - reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, temp_tensors[1],\ + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, temp_tensors[2],\ shapes[2], (vsi_size_t)new_rank ); - if (reshape_tensors[0] == NULL || reshape_tensors[1] == NULL) + if (reshape_tensors[0] == NULL || reshape_tensors[2] == NULL) { VSILOGE("reshape tensor failed!"); status = VSI_FAILURE; @@ -123,32 +125,89 @@ static vsi_status _tile_op_compute memcpy( &attr, &reshape_tensors[0]->attr, sizeof(attr)); attr.is_const = FALSE; attr.vtl = TRUE; - attr.size[0] = reshape_tensors[1]->attr.size[0]; - attr.size[1] = reshape_tensors[1]->attr.size[1]; - + attr.size[0] = reshape_tensors[2]->attr.size[0]; + attr.size[1] = reshape_tensors[2]->attr.size[1]; temp_tensors[0] = vsi_nn_CreateTensor( self->graph, &attr ); + memset( &attr, 0 , sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.is_const = FALSE; + attr.vtl = FALSE; + attr.size[0] = new_rank; + attr.dim_num = 1; + + multiples_value[0] = (int32_t)shapes[1][0]; + multiples_value[1] = (int32_t)shapes[1][1]; + for (i = 0; i < new_rank; i++) + { + multiples_value[i] = 1; + } + reshape_tensors[1] = vsi_nn_CreateTensorFromData(self->graph, (uint8_t *)multiples_value, &attr); + if (reshape_tensors[1] == NULL) + { + VSILOGE("vsi_nn_CreateTensorFromData failed!"); + status = VSI_FAILURE; + goto final; + } + + multiples_value[0] = 1; + multiples_value[1] = 1; + for (i = 0; i < new_rank; i++) + { + multiples_value[i] = (int32_t)shapes[1][i]; + } + + temp_tensors[1] = vsi_nn_CreateTensorFromData(self->graph, (uint8_t *)multiples_value, &attr); + + if (temp_tensors[1] == NULL) + { + VSILOGE("vsi_nn_CreateTensorFromData failed!"); + status = VSI_FAILURE; + goto final; + } + self->n = (vx_node)vsi_nn_kernel_selector( - self->graph, kernel_name, &reshape_tensors[0], 1, &temp_tensors[0], 1, NULL); + self->graph, kernel_name, &reshape_tensors[0], 2, &temp_tensors[0], 1, NULL); self->n = (vx_node)vsi_nn_kernel_selector( - self->graph, kernel_name, &temp_tensors[0], 1, &reshape_tensors[1], 1, NULL); + self->graph, kernel_name, &temp_tensors[0], 2, &reshape_tensors[2], 1, NULL); } else { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0],\ shapes[0], (vsi_size_t)new_rank ); - reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, temp_tensors[1],\ + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, temp_tensors[2],\ shapes[2], (vsi_size_t)new_rank ); - if (reshape_tensors[0] == NULL || reshape_tensors[1] == NULL) + if (reshape_tensors[0] == NULL || reshape_tensors[2] == NULL) { VSILOGE("reshape tensor failed!"); 
status = VSI_FAILURE; goto final; } + memset( &attr, 0 , sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.is_const = TRUE; + attr.vtl = FALSE; + attr.size[0] = new_rank; + attr.dim_num = 1; + + multiples_value[0] = (int32_t)shapes[1][0]; + multiples_value[1] = (int32_t)shapes[1][1]; + multiples_value[2] = (int32_t)shapes[1][2]; + multiples_value[3] = (int32_t)shapes[1][3]; + + reshape_tensors[1] = vsi_nn_CreateTensorFromData(self->graph, (uint8_t *)multiples_value, &attr); + + if (reshape_tensors[1] == NULL) + { + VSILOGE("vsi_nn_CreateTensorFromData failed!"); + status = VSI_FAILURE; + goto final; + } + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, kernel_name,\ - &reshape_tensors[0], 1, &reshape_tensors[1], 1, NULL ); + &reshape_tensors[0], 2, &reshape_tensors[2], 1, NULL ); } } @@ -160,11 +219,13 @@ static vsi_status _tile_op_compute final: vsi_safe_release_tensor(reshape_tensors[0]); vsi_safe_release_tensor(reshape_tensors[1]); + vsi_safe_release_tensor(reshape_tensors[2]); vsi_safe_release_tensor(temp_tensors[0]); + vsi_safe_release_tensor(temp_tensors[1]); if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE) { - self->n = vxTensorCopyNode( self->graph->g, temp_tensors[1]->t, outputs[0]->t); - vsi_safe_release_tensor(temp_tensors[1]); + self->n = vxTensorCopyNode( self->graph->g, temp_tensors[2]->t, outputs[0]->t); + vsi_safe_release_tensor(temp_tensors[2]); } return status; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c b/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c index 27a3c45..b7588ab 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c @@ -184,7 +184,7 @@ void vsi_nn_BinaryTreeRemoveNode vsi_nn_binary_tree_key_t key ) { - if ( NULL != root ) + if ( NULL == root ) { return; } diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index d696e8c..e862b9a 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -466,6 +466,7 @@ static _op_param_gen_t s_op_gen[] = /* GRID_SAMPLE */ NULL, /* LPNORM */ NULL, /* RESIZE_3D */ NULL, + /* REDUCEL2 */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/vip/virtual_device.cpp b/src/tim/vx/internal/src/vip/virtual_device.cpp index 2efa849..3568f69 100644 --- a/src/tim/vx/internal/src/vip/virtual_device.cpp +++ b/src/tim/vx/internal/src/vip/virtual_device.cpp @@ -84,7 +84,7 @@ void Device::WaitThreadIdle() { Worker::Worker() { } -void Worker::RunGraph(const vsi_nn_graph_t* graph) { +void Worker::RunGraph(vsi_nn_graph_t* graph) { vsi_nn_RunGraph(graph); } diff --git a/src/tim/vx/internal/src/vip/virtual_device_private.h b/src/tim/vx/internal/src/vip/virtual_device_private.h index b0e39a0..ecec2c8 100644 --- a/src/tim/vx/internal/src/vip/virtual_device_private.h +++ b/src/tim/vx/internal/src/vip/virtual_device_private.h @@ -75,7 +75,7 @@ class Worker{ Worker(); ~Worker(){}; void Handle(const QueueItem& item); - void RunGraph(const vsi_nn_graph_t* graph); + void RunGraph(vsi_nn_graph_t* graph); protected: }; diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index c9eed9c..9954d5d 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -41,6 +41,7 @@ #include "utils/vsi_nn_dtype_util.h" #include 
"vsi_nn_graph_optimization.h" #include "vsi_nn_error.h" +#include "vsi_nn_types_prv.h" static vsi_status _set_reference_node_name ( @@ -102,15 +103,71 @@ final: return status; } /* _set_reference_tensor_name() */ +static vsi_status _set_parameter_for_swap_handle + ( + vsi_nn_graph_t* graph, + vsi_nn_node_t* node, + vsi_nn_tensor_t* tensor, + uint32_t idx + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_swap_handle_cache_item_t* item = NULL; + status = vxSetParameterByIndex( node->n, idx, (vx_reference)tensor->t ); + if( VSI_SUCCESS != status ) + { + VSILOGE( "Set parameter %d for node[%08x] fail!", idx, node->n ); + goto final; + } + tensor->is_swapped = FALSE; + + if (!((vsi_nn_graph_prv_t*)graph)->swap_handle_cache.is_feature_on) { + goto final; + } + + item = (vsi_nn_swap_handle_cache_item_t *) + malloc( sizeof(vsi_nn_swap_handle_cache_item_t) ); + if( NULL == item ) + { + VSILOGE( "Create swap handle cache item fail." ); + goto final; + } + + memset( item, 0, sizeof(vsi_nn_swap_handle_cache_item_t) ); + item->node = node; + item->idx = idx; + item->tensor = tensor; + vsi_nn_LinkListPushStart( + (vsi_nn_link_list_t **)&(((vsi_nn_graph_prv_t*)graph)->swap_handle_cache.cache_list), + (vsi_nn_link_list_t *)item ); + +final: + return status; +} /* _set_parameter_for_swap_handle() */ + static vsi_status _check_swapped_tensors ( - const vsi_nn_graph_t* graph + vsi_nn_graph_t* graph ) { uint32_t i = 0; vsi_status status = VSI_SUCCESS; VSILOGD("Check swapped tensors"); + if (((vsi_nn_graph_prv_t*)graph)->swap_handle_cache.is_feature_on + && ((vsi_nn_graph_prv_t*)graph)->swap_handle_cache.is_cached) + { + vsi_nn_swap_handle_cache_item_t* cur_item = + ((vsi_nn_graph_prv_t*)graph)->swap_handle_cache.cache_list; + while( NULL != cur_item && VSI_SUCCESS == status ) + { + status = vxSetParameterByIndex( cur_item->node->n, cur_item->idx, + (vx_reference)(cur_item->tensor->t) ); + cur_item = (vsi_nn_swap_handle_cache_item_t *) + vsi_nn_LinkListNext( (vsi_nn_link_list_t *)cur_item ); + } + goto final; + } for( i = 0; i < graph->node_num; i++ ) { vsi_nn_node_t* node = vsi_nn_GetNode( graph, (vsi_nn_node_id_t)i ); @@ -127,13 +184,12 @@ static vsi_status _check_swapped_tensors tensor = vsi_nn_GetTensor( graph, node->input.tensors[j] ); if( tensor && tensor->is_swapped ) { - status = vxSetParameterByIndex( node->n, idx, (vx_reference)tensor->t ); + status = _set_parameter_for_swap_handle(graph, node, tensor, idx); if( VSI_SUCCESS != status ) { - VSILOGE( "Set input parameter %d for node[%08x] fail!", idx, node->n ); + VSILOGE( "_set_parameter_for_swap_handle for input fail!"); goto final; } - tensor->is_swapped = FALSE; } idx++; } @@ -143,19 +199,23 @@ static vsi_status _check_swapped_tensors tensor = vsi_nn_GetTensor( graph, node->output.tensors[j] ); if( tensor && tensor->is_swapped ) { - status = vxSetParameterByIndex( node->n, idx, (vx_reference)tensor->t ); + status = _set_parameter_for_swap_handle(graph, node, tensor, idx); if( VSI_SUCCESS != status ) { - VSILOGE( "Set output parameter %d for node[%08x] fail!", idx, node->n ); + VSILOGE( "_set_parameter_for_swap_handle for output fail!"); goto final; } - tensor->is_swapped = FALSE; } idx++; } } } + if (NULL != ((vsi_nn_graph_prv_t*)graph)->swap_handle_cache.cache_list) + { + ((vsi_nn_graph_prv_t*)graph)->swap_handle_cache.is_cached = TRUE; + } + final: return status; } /* _check_swapped_tensors() */ @@ -581,10 +641,10 @@ vsi_nn_graph_t * vsi_nn_CreateGraph return graph; } - graph = (vsi_nn_graph_t *)malloc( sizeof( vsi_nn_graph_t ) ); + graph = 
(vsi_nn_graph_t *)malloc( sizeof( vsi_nn_graph_prv_t ) ); if( NULL != graph ) { - memset( graph, 0, sizeof( vsi_nn_graph_t ) ); + memset( graph, 0, sizeof( vsi_nn_graph_prv_t ) ); graph->g = vxCreateGraph( ctx->c ); if( NULL != graph->g ) { @@ -669,6 +729,16 @@ void vsi_nn_ReleaseGraph { vsi_nn_rnn_DeinitWksp( ptr ); } + if( NULL != ((vsi_nn_graph_prv_t*)ptr)->swap_handle_cache.cache_list ) + { + vsi_nn_swap_handle_cache_item_t* item = ((vsi_nn_graph_prv_t*)ptr)->swap_handle_cache.cache_list; + while( NULL != item ) + { + vsi_nn_swap_handle_cache_item_t* tmp = (vsi_nn_swap_handle_cache_item_t *) + vsi_nn_LinkListPopStart( (vsi_nn_link_list_t **)&item ); + free( tmp ); + } + } free( ptr ); *graph = NULL; } @@ -814,7 +884,7 @@ vsi_status vsi_nn_VerifyGraph vsi_status vsi_nn_RunGraph ( - const vsi_nn_graph_t * graph + vsi_nn_graph_t * graph ) { vsi_status status; @@ -1160,7 +1230,7 @@ vsi_nn_node_t * vsi_nn_GetNode node = vsi_nn_MapGet( graph->node_table, (vsi_nn_map_key_t)id ); } return node; -} /* vsi_nn_GetTensor() */ +} /* vsi_nn_GetNode() */ void vsi_nn_GetTensors ( @@ -2547,3 +2617,25 @@ final: vsi_nn_safe_free(data); return status; } /* vsi_nn_ExecuteGraphLoop() */ + + +vsi_status vsi_nn_SetGraphTransformOption + ( + vsi_nn_graph_t* graph, + const char* ctrl_str, + size_t size + ) +{ + vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(ctrl_str); + VSI_UNREFERENCED(size); +#ifdef VX_GRAPH_TRANSFORM_OPTION_SUPPORT + + if(graph && graph->g) + { + status = vxSetGraphAttribute(graph->g, VX_GRAPH_VSI_TRANSFORM_OPTIONS, ctrl_str, size); + } +#endif + return status; +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/vsi_nn_node.c b/src/tim/vx/internal/src/vsi_nn_node.c index 4ffd687..641888e 100644 --- a/src/tim/vx/internal/src/vsi_nn_node.c +++ b/src/tim/vx/internal/src/vsi_nn_node.c @@ -32,6 +32,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "utils/vsi_nn_util.h" vsi_nn_node_t * vsi_nn_NewNode @@ -42,7 +43,7 @@ vsi_nn_node_t * vsi_nn_NewNode vsi_size_t output_num ) { - vsi_nn_node_t * node; + vsi_nn_node_prv_t* node; node = NULL; if(NULL == graph || FALSE == vsi_nn_OpIsValid(op)) @@ -51,59 +52,59 @@ vsi_nn_node_t * vsi_nn_NewNode goto final; } - node = (vsi_nn_node_t *)malloc( sizeof( vsi_nn_node_t ) ); + node = (vsi_nn_node_prv_t *)malloc( sizeof( vsi_nn_node_prv_t ) ); if( NULL != node ) { - memset( node, 0, sizeof( vsi_nn_node_t ) ); - node->graph = graph; - node->op = op; - node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; - node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO; - node->vx_param.down_scale_size_rounding = VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR; + memset( node, 0, sizeof( vsi_nn_node_prv_t ) ); + node->pon.graph = graph; + node->pon.op = op; + node->pon.vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + node->pon.vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO; + node->pon.vx_param.down_scale_size_rounding = VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR; /* init op */ - vsi_nn_OpInit( node->op, node ); + vsi_nn_OpInit( node->pon.op, &node->pon ); if( 0 == input_num && 0 == output_num ) { - vsi_nn_OpGetIoNum( op, node, &input_num, &output_num ); + vsi_nn_OpGetIoNum( op, &node->pon, &input_num, &output_num ); } /* init output struct */ - node->output.num = (uint32_t)output_num; - node->output.tensors = (vsi_nn_tensor_id_t *) malloc( + node->pon.output.num = (uint32_t)output_num; + node->pon.output.tensors = 
(vsi_nn_tensor_id_t *) malloc( output_num * sizeof( vsi_nn_tensor_id_t ) ); - if (NULL == node->output.tensors) + if (NULL == node->pon.output.tensors) { goto final; } - vsi_nn_InitTensorsId( node->output.tensors, (uint32_t)output_num ); + vsi_nn_InitTensorsId( node->pon.output.tensors, (uint32_t)output_num ); /* init input struct */ - node->input.num = (uint32_t)input_num; - node->input.tensors = (vsi_nn_tensor_id_t *) malloc( + node->pon.input.num = (uint32_t)input_num; + node->pon.input.tensors = (vsi_nn_tensor_id_t *) malloc( input_num * sizeof( vsi_nn_tensor_id_t ) ); - if (NULL == node->input.tensors) + if (NULL == node->pon.input.tensors) { goto final; } - vsi_nn_InitTensorsId( node->input.tensors, (uint32_t)input_num ); - node->attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE; - node->attr.enable_op_constraint_check = TRUE; + vsi_nn_InitTensorsId( node->pon.input.tensors, (uint32_t)input_num ); + node->pon.attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE; + node->pon.attr.enable_op_constraint_check = TRUE; } else { goto final; } - node->uid = VSI_NN_NODE_UID_NA; + node->pon.uid = VSI_NN_NODE_UID_NA; - return node; + return (vsi_nn_node_t*)node; final: if (node) { - vsi_nn_safe_free(node->output.tensors); - vsi_nn_safe_free(node->input.tensors); + vsi_nn_safe_free(node->pon.output.tensors); + vsi_nn_safe_free(node->pon.input.tensors); } vsi_nn_safe_free(node); @@ -127,18 +128,18 @@ void vsi_nn_ReleaseNode vsi_nn_node_t ** node ) { - vsi_nn_node_t * ptr; - ptr = (NULL != node) ? *node : NULL; + vsi_nn_node_prv_t* ptr; + ptr = (NULL != node) ? (vsi_nn_node_prv_t*)*node : NULL; if( NULL != ptr) { - vsi_nn_OpDeinit( ptr->op, ptr ); - if( NULL != ptr->input.tensors ) + vsi_nn_OpDeinit( ptr->pon.op, &ptr->pon ); + if( NULL != ptr->pon.input.tensors ) { - free( ptr->input.tensors ); + free( ptr->pon.input.tensors ); } - if( NULL != ptr->output.tensors ) + if( NULL != ptr->pon.output.tensors ) { - free( ptr->output.tensors ); + free( ptr->pon.output.tensors ); } free( ptr ); *node = NULL; @@ -192,6 +193,8 @@ void vsi_nn_PrintNode uint32_t i; int count; char buf[_MAX_PRINT_BUF_SZ]; + vsi_bool is_out_of_bound = FALSE; + int temp = 0; if( NULL == node ) { @@ -200,28 +203,43 @@ void vsi_nn_PrintNode count = snprintf( &buf[0], _MAX_PRINT_BUF_SZ, "%s", "[in:" ); for( i = 0; i < node->input.num; i ++ ) { - if( count >= _MAX_PRINT_BUF_SZ ) - { - break; - } - count += snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count, + temp = snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count, " %d,", node->input.tensors[i] ); + if ( temp >= _MAX_PRINT_BUF_SZ - count || temp == -1 ) + { + is_out_of_bound = TRUE; + goto final; + } + count += temp; } - count --; - count += snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count, + temp = snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count, "%s", " ], [out:" ); + if ( temp >= _MAX_PRINT_BUF_SZ - count || temp == -1 ) + { + is_out_of_bound = TRUE; + goto final; + } + count += temp; for( i = 0; i < node->output.num; i ++ ) { - if( count >= _MAX_PRINT_BUF_SZ ) + /* -3 means reserve memory for ending symbols --" ]" */ + temp = snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count - 3, + " %d,", node->input.tensors[i] ); + if ( temp >= _MAX_PRINT_BUF_SZ - count - 3 || temp == -1 ) { - break; + is_out_of_bound = TRUE; + goto final; } - count += snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count, - " %d,", node->output.tensors[i] ); + count += temp; } - count --; count += snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count, "%s", " ]" ); +final: + if ( is_out_of_bound ) + { + 
VSILOGW("Buffer is already full, cannot print all messages for (%16s)node[%u] [%08x]", + vsi_nn_OpGetName(node->op), id, node->n ); + } VSILOGI( "(%16s)node[%u] %s [%08x]", vsi_nn_OpGetName(node->op), id, buf, node->n ); } /* vsi_nn_PrintNode() */ diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c index b7f8b70..ca565da 100644 --- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c +++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c @@ -206,6 +206,7 @@ static _node_template s_template[] = /* REVERSESEQUENCE */ NULL, /* LPNORM */ NULL, /* RESIZE_3D */ NULL, + /* REDUCEL2 */ NULL, }; //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c ); diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index 265d922..3a9ac63 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -181,10 +181,11 @@ static void _set_preproc_node_norm_params { vsi_nn_preprocess_mean_and_scale_t* means_and_single_scale = (vsi_nn_preprocess_mean_and_scale_t*)mean_and_scale; + node->nn_param.pre_process.norm.scale = means_and_single_scale->scale; node->nn_param.pre_process.norm2.scale[0] = means_and_single_scale->scale; node->nn_param.pre_process.norm2.scale[1] = means_and_single_scale->scale; node->nn_param.pre_process.norm2.scale[2] = means_and_single_scale->scale; - for(i = 0; i < means_and_single_scale->channel_len; i++) + for (i = 0; i < means_and_single_scale->channel_len; i++) { node->nn_param.pre_process.norm.mean[i] = means_and_single_scale->channel_mean[i]; } @@ -835,6 +836,30 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam vsi_nn_node_id_t* enable_nodes, uint32_t enable_nodes_count ) +{ + vsi_bool* crop_set_start_only = NULL; + vsi_status status = VSI_FAILURE; + crop_set_start_only = (vsi_bool*)malloc(enable_nodes_count * sizeof(vsi_bool)); + TEST_CHECK_PTR( crop_set_start_only, final ); + memset(crop_set_start_only, 0, enable_nodes_count * sizeof(vsi_bool)); + status = vsi_nn_AddBinaryGraphInputsWithCropParamForCropOnly(graph, enable_nodes, + crop_set_start_only, enable_nodes_count); +final: + if(crop_set_start_only) + { + free(crop_set_start_only); + crop_set_start_only = NULL; + } + return status; +} /* vs_nn_AddBinaryGraphInputsWithCropParam() */ + +vsi_status vsi_nn_AddBinaryGraphInputsWithCropParamForCropOnly +( + vsi_nn_graph_t* graph, + vsi_nn_node_id_t* enable_nodes, + vsi_bool* crop_set_start_only, + uint32_t enable_nodes_count +) { uint32_t i, j, k, idx, p; vsi_status status = VSI_FAILURE; @@ -905,7 +930,14 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam //} //else //{ - num_of_graph_real_inputs += 4; + if (crop_set_start_only[j]) + { + num_of_graph_real_inputs += 2; + } + else + { + num_of_graph_real_inputs += 4; + } //} } } @@ -1010,14 +1042,27 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam VX_SCALAR_TYPE, &data_type, sizeof(vx_enum)); - /*scale_x,scale_y,left,top are int32 - * and index <4 type,mean and - * scarlar are float*/ - if (data_type != VX_TYPE_INT32 || - scalar_index >= 4) - continue; - graph_inputs[j++] = ref; - scalar_index++; + /*corp w, h, start_x, start_y are int32 type, + * and index <4 , mean and scale are float*/ + if (crop_set_start_only[k]) + { + if (data_type != VX_TYPE_INT32) + continue; + if (scalar_index < 4 && scalar_index >=2) + { + graph_inputs[j++] = ref; + } + scalar_index++; + } + else + { + if (data_type == 
VX_TYPE_INT32 && + scalar_index < 4) + { + graph_inputs[j++] = ref; + scalar_index++; + } + } } } break; @@ -1055,7 +1100,6 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam graph_outputs = (vx_reference*)malloc(num_of_graph_real_outputs * sizeof(vx_reference)); TEST_CHECK_PTR( graph_outputs, final ); memset(graph_outputs, 0, num_of_graph_real_outputs * sizeof(vx_reference)); - for (i = 0, j = 0; i < num_of_graph_outputs; i++) { tensor = vsi_nn_GetTensor(graph, graph->output.tensors[i]); @@ -1099,7 +1143,7 @@ final: free(graph_outputs); } return status; -} /* vs_nn_AddBinaryGraphInputsWithCropParam() */ +} /* vsi_nn_AddBinaryGraphInputsWithCropParamForCropOnly() */ vsi_status vsi_nn_UpdateCropParamsForBinaryGraph ( @@ -1116,8 +1160,10 @@ vsi_status vsi_nn_UpdateCropParamsForBinaryGraph uint32_t i, j; uint32_t numParams = 0; int32_t scalar_value[4] = {0}; + uint32_t scalar_value_idx = 0; vsi_status status = VSI_SUCCESS; uint32_t input_idx = enabled_crop_input_idx; + uint32_t scalar_num = 0; scalar_value[0] = (int32_t)((crop_w << 15) / dst_w); scalar_value[1] = (int32_t)((crop_h << 15) / dst_h); scalar_value[2] = start_x; /*rgb start_x*3, rgb start_x*4*/ @@ -1130,35 +1176,23 @@ vsi_status vsi_nn_UpdateCropParamsForBinaryGraph { vx_parameter param = 0; vx_enum type = 0; + vx_enum direction = 0; vx_reference ref = 0; uint32_t scalar_idx = 0; - uint32_t scalar_value_idx = 0; + uint32_t scalar_start_idx = 0; + uint32_t scalar_end_idx = 0; int32_t temp_value = 0; + uint32_t cur_input_index = 0; status |= vxQueryNode(node->n, VX_NODE_PARAMETERS, &numParams, sizeof(numParams)); - for (j = 0; j < numParams; j++) - { - param = vxGetParameterByIndex(node->n, j); - - if (param) - { - status |= vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); - if (type == VX_TYPE_SCALAR) - { - scalar_idx = j; - break; - } - } - } while (input_idx > 0) { - uint32_t tensor_idx = scalar_idx + 4; - for (j = tensor_idx; j < numParams; j++) + for (j = cur_input_index; j < numParams; j++) { + param = vxGetParameterByIndex(node->n, j); if (param) { - status |= vxQueryParameter( - param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + status |= vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); if (type == VX_TYPE_SCALAR) { scalar_idx = j; @@ -1166,9 +1200,54 @@ vsi_status vsi_nn_UpdateCropParamsForBinaryGraph } } } - input_idx--; + for (j = scalar_idx; j < numParams; j++) + { + param = vxGetParameterByIndex(node->n, j); + if (param) + { + status |= vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + status |= vxQueryParameter(param,VX_PARAMETER_DIRECTION, &direction,sizeof(vx_enum)); + if (type == VX_TYPE_TENSOR && direction == VX_INPUT) + { + cur_input_index = j; + input_idx--; + break; + } + } + } } - for (j = scalar_idx; j < scalar_idx + 4; j++) + for (j = cur_input_index; j < numParams; j++) + { + param = vxGetParameterByIndex(node->n, j); + if(param) + { + status |= vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + scalar_start_idx = j; + break; + } + } + } + for (j = scalar_start_idx; j < numParams; j++) + { + param = vxGetParameterByIndex(node->n, j); + if (param) + { + status |= vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if (type == VX_TYPE_TENSOR) + { + scalar_end_idx = j - 1; + break; + } + } + } + scalar_num = scalar_end_idx - scalar_start_idx + 1; + if (scalar_num == 2) + { + scalar_value_idx = 2; + } + for (j = scalar_start_idx; j < scalar_end_idx + 1; j++) { temp_value = 
scalar_value[scalar_value_idx++]; param = vxGetParameterByIndex(node->n, j); @@ -1183,7 +1262,8 @@ vsi_status vsi_nn_UpdateCropParamsForBinaryGraph } } } - + vxReleaseParameter(¶m); + param = NULL; } } return status; diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index 5f7cb47..a333d42 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -2453,6 +2453,17 @@ vsi_status vsi_nn_SwapTensorHandle return status; } /* vsi_nn_SwapTensorHandle() */ +vsi_status vsi_nn_SwapTensorHandleWithCache + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_t * tensor0, + vsi_nn_tensor_t * tensor1 + ) +{ + ((vsi_nn_graph_prv_t*)graph)->swap_handle_cache.is_feature_on = TRUE; + return vsi_nn_SwapTensorHandle(tensor0, tensor1); +} /* vsi_nn_SwapTensorHandleWithCache() */ + vsi_size_t vsi_nn_vxGetTensorElementNum ( vsi_nn_tensor_attr_t *attr diff --git a/src/tim/vx/internal/src/vsi_nn_types_prv.h b/src/tim/vx/internal/src/vsi_nn_types_prv.h index a2c2b56..81b1d36 100644 --- a/src/tim/vx/internal/src/vsi_nn_types_prv.h +++ b/src/tim/vx/internal/src/vsi_nn_types_prv.h @@ -33,6 +33,21 @@ extern "C"{ #endif +typedef struct _vsi_nn_swap_handle_cache_item +{ + vsi_nn_link_list_t link_list; + vsi_nn_node_t* node; + vsi_nn_tensor_t* tensor; + uint32_t idx; +} vsi_nn_swap_handle_cache_item_t; + +typedef struct _vsi_nn_swap_handle_cache +{ + int8_t is_feature_on; + int8_t is_cached; + vsi_nn_swap_handle_cache_item_t* cache_list; +} vsi_nn_swap_handle_cache_t; + /** * Internal Graph structure, internal use only. */ @@ -42,6 +57,7 @@ typedef struct _vsi_nn_graph_prv vsi_nn_graph_t pog; // Add graph internal attribute here... + vsi_nn_swap_handle_cache_t swap_handle_cache; } vsi_nn_graph_prv_t; /** Internal Node structure, internal use only. */ @@ -50,6 +66,12 @@ typedef struct _vsi_nn_node_prv /** Public Ovxlib Node(pon)*/ vsi_nn_node_t pon; + /** some op will adjust its const tensors in op_setup, + * this field is to indicate whether the const tensors + * are processed in op_setup phase, this adjustment cannot + * be done more than once */ + int8_t processed; + // Add node internal attribute here... } vsi_nn_node_prv_t; @@ -67,6 +89,12 @@ typedef struct _vsi_nn_tensor_prv /** is scalar*/ int8_t is_scalar; + /** some op will adjust its const tensors in op_setup, + * this field is to indicate whether the const tensors + * are processed in op_setup phase, this adjustment cannot + * be done more than once */ + int8_t processed; + // Add tensor internal attribute here... } vsi_nn_tensor_prv_t;
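
The tile compute hunk above packs the repeat factors into a small constant INT32 tensor before handing it to the kernel selector. A minimal sketch of that pattern, assuming the ovxlib public API shown in the hunk (vsi_nn_tensor_attr_t, vsi_nn_CreateTensorFromData); the umbrella header name and the helper name are assumptions for illustration:

    #include <string.h>
    #include "vsi_nn_pub.h"   /* assumed ovxlib umbrella header */

    /* Sketch: build a 1-D constant INT32 tensor holding the tile multiples,
     * mirroring the tile compute path above. */
    static vsi_nn_tensor_t* create_multiples_tensor
        (
        vsi_nn_graph_t* graph,
        int32_t*        multiples,
        uint32_t        rank
        )
    {
        vsi_nn_tensor_attr_t attr;

        memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) );
        attr.dtype.vx_type = VSI_NN_TYPE_INT32;
        attr.is_const = TRUE;   /* data is baked into the tensor */
        attr.vtl = FALSE;       /* not a virtual tensor */
        attr.size[0] = rank;    /* one entry per tiled dimension */
        attr.dim_num = 1;

        return vsi_nn_CreateTensorFromData( graph, (uint8_t*)multiples, &attr );
    }

As in the hunk, the caller is expected to check the returned pointer for NULL and to release the tensor with vsi_safe_release_tensor once the kernel node has been created.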
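
The new swap-handle cache records one (node, parameter index, tensor) item per re-bound parameter in a linked list, so that after the first pass _check_swapped_tensors can replay vxSetParameterByIndex directly instead of walking every node and tensor again. A minimal sketch of that replay loop, using the types and list helpers introduced above; header names are assumptions:

    #include "vsi_nn_pub.h"            /* assumed ovxlib umbrella header   */
    #include "vsi_nn_types_prv.h"      /* vsi_nn_graph_prv_t, cache item   */
    #include "utils/vsi_nn_link_list.h"

    /* Sketch of the replay done once swap_handle_cache.is_cached is set. */
    static vsi_status replay_swap_handle_cache( vsi_nn_graph_prv_t* graph_prv )
    {
        vsi_status status = VSI_SUCCESS;
        vsi_nn_swap_handle_cache_item_t* item =
            graph_prv->swap_handle_cache.cache_list;

        while( NULL != item && VSI_SUCCESS == status )
        {
            /* Re-bind the cached tensor to the same node parameter slot. */
            status = vxSetParameterByIndex( item->node->n, item->idx,
                (vx_reference)item->tensor->t );
            item = (vsi_nn_swap_handle_cache_item_t*)
                vsi_nn_LinkListNext( (vsi_nn_link_list_t*)item );
        }
        return status;
    }

The list is populated by _set_parameter_for_swap_handle with vsi_nn_LinkListPushStart and is drained in vsi_nn_ReleaseGraph with vsi_nn_LinkListPopStart, so every cached item is freed exactly once.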
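
vsi_nn_CreateGraph and vsi_nn_NewNode now allocate the private struct sizes (vsi_nn_graph_prv_t, vsi_nn_node_prv_t) while callers keep using the public pointer types. This works because the public struct is the first member (pog / pon), so the same address can be cast in both directions. A self-contained illustrative sketch of the idiom; the type and function names here are invented for the example, not ovxlib names:

    #include <stdlib.h>
    #include <string.h>

    typedef struct { int id; } public_obj_t;

    typedef struct
    {
        public_obj_t pub;     /* must stay the first member */
        int          cached;  /* internal-only state        */
    } private_obj_t;

    static public_obj_t* obj_create( void )
    {
        /* Allocate the private size, hand out the public pointer. */
        private_obj_t* prv = (private_obj_t*)malloc( sizeof(private_obj_t) );
        if( NULL == prv ) return NULL;
        memset( prv, 0, sizeof(private_obj_t) );
        return &prv->pub;
    }

    static void obj_mark_cached( public_obj_t* obj )
    {
        /* Valid because &prv->pub and prv share the same address. */
        ((private_obj_t*)obj)->cached = 1;
    }

This is also why freeing the public pointer in vsi_nn_ReleaseGraph / vsi_nn_ReleaseNode releases the whole private block in one call.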
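
The vsi_nn_PrintNode rewrite checks every snprintf return value against the remaining space instead of testing the running count once per loop. A minimal sketch of that accumulation pattern with a generic buffer, assuming C99 snprintf semantics (it returns the length it would have needed, or a negative value on error):

    #include <stdio.h>

    #define BUF_SZ 256

    /* Sketch: append formatted ids into a fixed buffer and stop cleanly
     * when the next write would not fit. */
    static int append_ids( char* buf, const int* ids, int n )
    {
        int count = 0;
        int i;

        for( i = 0; i < n; i++ )
        {
            int written = snprintf( &buf[count], BUF_SZ - count, " %d,", ids[i] );
            if( written < 0 || written >= BUF_SZ - count )
            {
                return -1;   /* truncated: caller logs a warning and bails out */
            }
            count += written;
        }
        return count;
    }

Reserving a few extra bytes for the closing " ]" (the "- 3" in the hunk above) follows the same idea: the budget passed to snprintf already excludes the space the trailer will need.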
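
With the split above, vsi_nn_AddBinaryGraphInputsWithCropParam simply allocates a zeroed crop_set_start_only array (keeping the four-scalar behaviour for every enabled node) and forwards to vsi_nn_AddBinaryGraphInputsWithCropParamForCropOnly. A hedged usage sketch of the new entry point; the graph handle and node ids are assumptions for illustration:

    #include "vsi_nn_pub.h"   /* assumed ovxlib umbrella header */

    /* Sketch: export only the crop start offsets for the second enabled
     * node, and the full scale/offset set for the first one. */
    static vsi_status add_crop_inputs( vsi_nn_graph_t* graph )
    {
        vsi_nn_node_id_t enable_nodes[2]       = { 3, 7 };         /* hypothetical ids */
        vsi_bool         crop_set_start_only[2] = { FALSE, TRUE }; /* node 7: start_x/start_y only */

        return vsi_nn_AddBinaryGraphInputsWithCropParamForCropOnly(
            graph, enable_nodes, crop_set_start_only, 2 );
    }

Per the input-counting logic above, a node flagged start-only contributes two scalar graph inputs (the crop origin) instead of four (scale plus origin).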
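
vsi_nn_UpdateCropParamsForBinaryGraph encodes the crop scales as Q15 fixed-point ratios (crop size over destination size, shifted left by 15) followed by the start offsets; when a node exposes only two scalar inputs, scalar_value_idx starts at 2 so only the offsets are written. A small worked sketch of the Q15 encoding with illustrative values:

    #include <stdint.h>
    #include <stdio.h>

    /* Sketch: cropping a 640x480 window down to 320x240 gives
     * (640 << 15) / 320 = 65536, i.e. 2.0 in Q15 (1.0 == 32768). */
    int main( void )
    {
        uint32_t crop_w = 640, dst_w = 320;
        uint32_t crop_h = 480, dst_h = 240;
        int32_t  start_x = 16, start_y = 8;
        int32_t  scalar_value[4];

        scalar_value[0] = (int32_t)((crop_w << 15) / dst_w); /* x scale, Q15 */
        scalar_value[1] = (int32_t)((crop_h << 15) / dst_h); /* y scale, Q15 */
        scalar_value[2] = start_x;                           /* crop origin  */
        scalar_value[3] = start_y;

        printf( "%d %d %d %d\n", scalar_value[0], scalar_value[1],
            scalar_value[2], scalar_value[3] );
        return 0;
    }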
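
vsi_nn_SwapTensorHandleWithCache is a thin wrapper that switches swap_handle_cache.is_feature_on before delegating to vsi_nn_SwapTensorHandle, so the parameter re-binding done on the next run gets cached. A hedged usage sketch, assuming the graph has been verified and the two input tensors were created with handles; error handling is trimmed for brevity:

    #include "vsi_nn_pub.h"   /* assumed ovxlib umbrella header */

    /* Sketch: double-buffer an input by swapping tensor handles between runs. */
    static vsi_status run_with_swapped_input
        (
        vsi_nn_graph_t*  graph,
        vsi_nn_tensor_t* input_a,
        vsi_nn_tensor_t* input_b
        )
    {
        vsi_status status;

        /* Enables the swap-handle cache, then swaps the two handles. */
        status = vsi_nn_SwapTensorHandleWithCache( graph, input_a, input_b );
        if( VSI_SUCCESS != status ) return status;

        /* The first run after a swap re-binds the swapped tensors and fills
         * the cache; later swaps replay the cached (node, index, tensor) list. */
        return vsi_nn_RunGraph( graph );
    }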