From 8494275d7608942aa584c9c13bd5e2d77be9906c Mon Sep 17 00:00:00 2001
From: Chen Feiyue <69809761+chenfeiyue-cfy@users.noreply.github.com>
Date: Wed, 8 Jan 2025 13:22:46 +0800
Subject: [PATCH] Update internal ovxlib to release/1.2.22 (#706)

* Update internal ovxlib to release/1.2.22

Signed-off-by: Feiyue.Chen

* Refine yaml file for blocking tfhub model tests

Signed-off-by: Feiyue.Chen

---------

Signed-off-by: Feiyue.Chen
---
 .github/workflows/cmake_x86_vsim.yml | 106 +-
 VERSION | 2 +-
 .../include/custom/custom_node_type.def | 1 +
 .../vx/internal/include/custom/custom_ops.def | 1 +
 .../custom/ops/vsi_nn_op_custom_letterbox.h | 61 +
 .../include/custom/vsi_nn_custom_node_type.h | 1 +
 src/tim/vx/internal/include/interface/ops.def | 1 +
 .../include/ops/vsi_nn_op_pre_process_rgb.h | 2 +-
 .../vx/internal/include/ops/vsi_nn_op_rope.h | 49 +
 .../vx/internal/include/ops/vsi_nn_op_topk.h | 1 +
 .../include/utils/vsi_nn_dtype_util_prv.h | 16 +-
 src/tim/vx/internal/include/vsi_nn/vsi_nn.h | 2034 ----------
 src/tim/vx/internal/include/vsi_nn_context.h | 18 +-
 .../internal/include/vsi_nn_feature_config.h | 3 +
 .../vx/internal/include/vsi_nn_node_type.h | 2 +
 src/tim/vx/internal/include/vsi_nn_tensor.h | 4 +-
 .../vx/internal/include/vsi_nn_tensor_util.h | 28 +
 src/tim/vx/internal/include/vsi_nn_version.h | 2 +-
 .../ops/kernel/evis/custom_letterbox_evis.c | 475 +++
 .../ops/kernel/evis/custom_softmax_evis.c | 124 +-
 .../custom/ops/vsi_nn_op_custom_letterbox.c | 227 ++
 src/tim/vx/internal/src/kernel/cl/cumsum_cl.c | 6 +
 .../vx/internal/src/kernel/cl/matrixmul_cl.c | 4 +-
 .../vx/internal/src/kernel/cl/one_hot_cl.c | 1 +
 src/tim/vx/internal/src/kernel/cl/prelu_cl.c | 2 +-
 src/tim/vx/internal/src/kernel/cl/rope_cl.c | 329 ++
 src/tim/vx/internal/src/kernel/cl/swish_cl.c | 3 +-
 src/tim/vx/internal/src/kernel/cl/topk_cl.c | 8 +-
 .../internal/src/kernel/evis/bucketize_evis.c | 3 +-
 .../src/kernel/evis/depthwise_conv1d_evis.c | 4 +-
 .../kernel/evis/group_normalization_evis.c | 21 +-
 .../vx/internal/src/kernel/evis/prelu_evis.c | 230 +-
 .../src/kernel/evis/resize_bilinear_evis.c | 607 +--
 .../vx/internal/src/kernel/evis/rope_evis.c | 744 ++++
 .../src/kernel/evis/scatter_nd_update_evis.c | 29 +
 .../vx/internal/src/kernel/evis/swish_evis.c | 3 +-
 .../vx/internal/src/kernel/vsi_nn_kernel.c | 24 +-
 .../src/kernel/vsi_nn_kernel_selector.c | 7 +-
 .../vx/internal/src/kernel/vx/group_norm_vx.c | 89 +
 .../internal/src/kernel/vx/instance_norm_vx.c | 87 +
 .../vx/internal/src/kernel/vx/layer_norm_vx.c | 22 +-
 src/tim/vx/internal/src/kernel/vx/pad2_vx.c | 3 +-
 .../internal/src/kernel/vx/relationalops_vx.c | 27 +-
 src/tim/vx/internal/src/kernel/vx/swish_vx.c | 3 +-
 .../vx/internal/src/libnnext/ops/cl/cumsum.cl | 129 +-
 .../internal/src/libnnext/ops/cl/cumsum_2d.cl | 554 +--
 .../internal/src/libnnext/ops/cl/one_hot.cl | 27 +
 .../vx/internal/src/libnnext/ops/cl/rope_0.cl | 373 ++
 .../src/libnnext/ops/vx/custom_letterbox.vx | 307 ++
 .../src/libnnext/ops/vx/custom_softmax.vx | 94 +-
 .../libnnext/ops/vx/group_normalization_2.vx | 44 +-
 .../vx/internal/src/libnnext/ops/vx/prelu.vx | 161 +-
 ...resize_bilinear_U8_half_pixel_centers_3.vx | 181 +
 ...resize_bilinear_U8_half_pixel_centers_4.vx | 102 +
 ...resize_bilinear_U8_half_pixel_centers_5.vx | 167 +
 .../vx/internal/src/libnnext/ops/vx/rope_0.vx | 303 ++
 .../vx/internal/src/libnnext/ops/vx/rope_1.vx | 245 ++
 .../vx/internal/src/libnnext/ops/vx/rope_2.vx | 312 ++
 .../vx/internal/src/libnnext/ops/vx/rope_3.vx | 312 ++
 .../ops/vx/scatter_nd_update_special.vx | 98 +
 .../src/libnnext/vsi_nn_libnnext_resource.c | 3438 ++++++++++++++---
 .../internal/src/libnnext/vsi_nn_vxkernel.c | 21 +-
 .../internal/src/ops/vsi_nn_op_batch_norm.c | 192 +-
 .../vx/internal/src/ops/vsi_nn_op_bitcast.c | 1 +
 .../vx/internal/src/ops/vsi_nn_op_cumsum.c | 1 +
 .../internal/src/ops/vsi_nn_op_dataconvert.c | 1 +
 .../vx/internal/src/ops/vsi_nn_op_eltwise.c | 13 +-
 .../vx/internal/src/ops/vsi_nn_op_gather.c | 1 +
 .../src/ops/vsi_nn_op_groupnormalize.c | 2 +
 .../vx/internal/src/ops/vsi_nn_op_grucell.c | 5 +-
 .../src/ops/vsi_nn_op_l1_layer_norm.c | 22 -
 .../src/ops/vsi_nn_op_l2normalizescale.c | 3 +-
 .../src/ops/vsi_nn_op_layernormalize.c | 2 +-
 .../src/ops/vsi_nn_op_lstmunit_activation.c | 3 +-
 .../vx/internal/src/ops/vsi_nn_op_one_hot.c | 6 +
 .../src/ops/vsi_nn_op_pre_process_rgb.c | 98 +-
 src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c | 96 +-
 .../vx/internal/src/ops/vsi_nn_op_reduce.c | 4 +-
 .../vx/internal/src/ops/vsi_nn_op_reshape.c | 31 +-
 .../vx/internal/src/ops/vsi_nn_op_reshape2.c | 27 +-
 src/tim/vx/internal/src/ops/vsi_nn_op_rope.c | 145 +
 .../src/ops/vsi_nn_op_softmax_internal.c | 3 +-
 src/tim/vx/internal/src/ops/vsi_nn_op_topk.c | 97 +-
 .../src/utils/vsi_nn_code_generator.c | 1 +
 src/tim/vx/internal/src/utils/vsi_nn_dtype.c | 2 +-
 src/tim/vx/internal/src/utils/vsi_nn_util.c | 17 +-
 src/tim/vx/internal/src/vsi_nn_context.c | 127 +
 src/tim/vx/internal/src/vsi_nn_graph.c | 5 +-
 .../internal/src/vsi_nn_graph_optimization.c | 9 +-
 .../vx/internal/src/vsi_nn_pre_post_process.c | 46 +-
 src/tim/vx/internal/src/vsi_nn_rnn_helper.c | 3 +-
 src/tim/vx/internal/src/vsi_nn_tensor.c | 91 +
 .../vx/internal/src/vsi_nn_tensor_util_prv.h | 10 +
 src/tim/vx/internal/src/vsi_nn_types_prv.h | 5 +
 94 files changed, 9466 insertions(+), 3885 deletions(-)
 create mode 100644 src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_letterbox.h
 create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_rope.h
 delete mode 100644 src/tim/vx/internal/include/vsi_nn/vsi_nn.h
 create mode 100644 src/tim/vx/internal/src/custom/ops/kernel/evis/custom_letterbox_evis.c
 create mode 100644 src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_letterbox.c
 create mode 100644 src/tim/vx/internal/src/kernel/cl/rope_cl.c
 create mode 100644 src/tim/vx/internal/src/kernel/evis/rope_evis.c
 create mode 100644 src/tim/vx/internal/src/kernel/vx/group_norm_vx.c
 create mode 100644 src/tim/vx/internal/src/kernel/vx/instance_norm_vx.c
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/rope_0.cl
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/custom_letterbox.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_3.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_4.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_5.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/rope_0.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/rope_1.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/rope_2.vx
 create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/rope_3.vx
 create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_rope.c

diff --git a/.github/workflows/cmake_x86_vsim.yml b/.github/workflows/cmake_x86_vsim.yml
index 02a60b0..45d399a 100644
--- a/.github/workflows/cmake_x86_vsim.yml
+++ b/.github/workflows/cmake_x86_vsim.yml
@@ -124,7 +124,7 @@ jobs:
       run: |
         git config --global user.email "xiang.zhang@verisilicon.com"
        git config --global
user.name "xiang.zhang" - git clone https://github.com/tensorflow/tensorflow.git ${{github.workspace}}/3rd-party/tensorflow && cd ${{github.workspace}}/3rd-party/tensorflow/ && git checkout v2.10.0 + git clone https://github.com/tensorflow/tensorflow.git ${{github.workspace}}/3rd-party/tensorflow && cd ${{github.workspace}}/3rd-party/tensorflow/ && git checkout v2.16.1 git clone https://github.com/VeriSilicon/tflite-vx-delegate.git ${{github.workspace}}/vx-delegate cmake -B ${{github.workspace}}/vx-delegate/build -S ${{github.workspace}}/vx-delegate -DFETCHCONTENT_SOURCE_DIR_TENSORFLOW=${{github.workspace}}/3rd-party/tensorflow -DTIM_VX_INSTALL=${{github.workspace}}/tim-vx.install.dir/ -DTFLITE_ENABLE_NNAPI=OFF -DTFLITE_ENABLE_XNNPACK=OFF cmake --build ${{github.workspace}}/vx-delegate/build --config ${{env.BUILD_TYPE}} @@ -283,61 +283,61 @@ jobs: # chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model # ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/tfhub.movenet.multipose.tflite - tfhub-efficientdet-lite0: - runs-on: ubuntu-latest - needs: [vx-delegate-build, tim-vx-unit-test] - steps: - - name: download test binary - uses: actions/download-artifact@v3 - - name: download model - run: | - wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite0/detection/metadata/1.tflite - - name: benchmark-model - run: | - chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model - ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite + # tfhub-efficientdet-lite0: + # runs-on: ubuntu-latest + # needs: [vx-delegate-build, tim-vx-unit-test] + # steps: + # - name: download test binary + # uses: actions/download-artifact@v3 + # - name: download model + # run: | + # wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite0/detection/metadata/1.tflite + # - name: benchmark-model + # run: | + # chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model + # ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite - tfhub-efficientdet-lite1: - runs-on: ubuntu-latest - needs: [vx-delegate-build, tim-vx-unit-test] - steps: - - name: download test binary - uses: actions/download-artifact@v3 - - name: download model - run: | - wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite1/detection/metadata/1.tflite - - name: benchmark-model - run: | - chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model - ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite + # tfhub-efficientdet-lite1: + # runs-on: ubuntu-latest + # needs: [vx-delegate-build, tim-vx-unit-test] + # steps: + # - name: download test binary + # uses: actions/download-artifact@v3 + # - name: 
download model + # run: | + # wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite1/detection/metadata/1.tflite + # - name: benchmark-model + # run: | + # chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model + # ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite - tfhub-efficientdet-lite2: - runs-on: ubuntu-latest - needs: [vx-delegate-build, tim-vx-unit-test] - steps: - - name: download test binary - uses: actions/download-artifact@v3 - - name: download model - run: | - wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite - - name: benchmark-model - run: | - chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model - ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite + # tfhub-efficientdet-lite2: + # runs-on: ubuntu-latest + # needs: [vx-delegate-build, tim-vx-unit-test] + # steps: + # - name: download test binary + # uses: actions/download-artifact@v3 + # - name: download model + # run: | + # wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite + # - name: benchmark-model + # run: | + # chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model + # ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite - tfhub-efficientdet-lite3: - runs-on: ubuntu-latest - needs: [vx-delegate-build, tim-vx-unit-test] - steps: - - name: download test binary - uses: actions/download-artifact@v3 - - name: download model - run: | - wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite - - name: benchmark-model - run: | - chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model - ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite + # tfhub-efficientdet-lite3: + # runs-on: ubuntu-latest + # needs: [vx-delegate-build, tim-vx-unit-test] + # steps: + # - name: download test binary + # uses: actions/download-artifact@v3 + # - name: download model + # run: | + # wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite + # - name: benchmark-model + # run: | + # chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model + # ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite # acuity-yolov3-608-quant: # runs-on: ubuntu-latest diff --git a/VERSION b/VERSION index fd9d1a5..9a83513 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ 
-1.2.14 +1.2.22 diff --git a/src/tim/vx/internal/include/custom/custom_node_type.def b/src/tim/vx/internal/include/custom/custom_node_type.def index c5ef3e0..9ac9424 100644 --- a/src/tim/vx/internal/include/custom/custom_node_type.def +++ b/src/tim/vx/internal/include/custom/custom_node_type.def @@ -9,3 +9,4 @@ DEF_NODE_TYPE(custom_sample) DEF_NODE_TYPE(custom_tiny_yolov4_postprocess) DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_confidence) DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_box) +DEF_NODE_TYPE(custom_letterbox) \ No newline at end of file diff --git a/src/tim/vx/internal/include/custom/custom_ops.def b/src/tim/vx/internal/include/custom/custom_ops.def index 2074b8f..47f25f2 100644 --- a/src/tim/vx/internal/include/custom/custom_ops.def +++ b/src/tim/vx/internal/include/custom/custom_ops.def @@ -9,3 +9,4 @@ DEF_OP(CUSTOM_SAMPLE) DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS) DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE) DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX) +DEF_OP(CUSTOM_LETTERBOX) diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_letterbox.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_letterbox.h new file mode 100644 index 0000000..ef01263 --- /dev/null +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_letterbox.h @@ -0,0 +1,61 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CUSTOM_LETTERBOX_H +#define _VSI_NN_OP_CUSTOM_LETTERBOX_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_custom_letterbox_param +{ + struct _custom_letterbox_local_data_t* local; + int32_t new_shape_w; + int32_t new_shape_h; + vx_bool auto_bool; + vx_bool scaleFill; + vx_bool scaleup; + int32_t stride; + vx_bool center; + float mean_r; + float mean_g; + float mean_b; + float scale_r; + float scale_g; + float scale_b; + int32_t pad_value_r; + int32_t pad_value_g; + int32_t pad_value_b; + vx_bool reverse_channel; +} vsi_nn_custom_letterbox_param; +_compiler_assert(offsetof(vsi_nn_custom_letterbox_param, local) == 0, \ + vsi_nn_custom_lertterbox_h ); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h index eb23a20..2c83d81 100644 --- a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h +++ b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h @@ -34,5 +34,6 @@ #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h" #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h" #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h" +#include "custom/ops/vsi_nn_op_custom_letterbox.h" #endif diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index 23d3f74..28f5716 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -203,3 +203,4 @@ DEF_OP(BITCAST) DEF_OP(GROUPED_CONV3D) DEF_OP(COL2IM) DEF_OP(L1_LAYER_NORM) +DEF_OP(ROPE) diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h index 9e05a59..0944ae6 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h @@ -80,7 +80,7 @@ typedef struct _vsi_nn_pre_process_rgb_param float g_scale; float b_scale; /* pre process rgb layer local data structure */ - vsi_nn_pre_process_rgb_lcl_data local; + vsi_nn_pre_process_rgb_lcl_data *local; } vsi_nn_pre_process_rgb_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_rope.h b/src/tim/vx/internal/include/ops/vsi_nn_op_rope.h new file mode 100644 index 0000000..7d16fb0 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_rope.h @@ -0,0 +1,49 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_ROPE_H +#define _VSI_NN_OP_ROPE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_rope_param +{ + struct _rope_local_data_t* local; + // Add parameters here + int32_t axis; + vsi_bool interleaved; +} vsi_nn_rope_param; +_compiler_assert(offsetof(vsi_nn_rope_param, local) == 0, \ + vsi_nn_rope_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h b/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h index bccc0b5..99d57e2 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h @@ -34,6 +34,7 @@ typedef struct _vsi_nn_topk_param { uint32_t k; int32_t axis; + struct _topk_local_data_t* local; } vsi_nn_topk_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h index ed78571..b005473 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -384,25 +384,17 @@ static VSI_INLINE_API float fp16_to_fp32 static VSI_INLINE_API float bfp16_to_fp32 ( - int16_t in + uint16_t in ) { - uint32_t t1, t2, t3; float out; fp32_bit_cast_t fp32_bit_cast; - t1 = in & 0x00FF; // Mantissa - t2 = in & 0xFF00; // Sign bit + Exponent - t3 = in & 0x7F00; // Exponent + fp32_bit_cast.data = (uint32_t)(in << 16); - t1 <<= 16; - t2 <<= 16; // Shift (sign + Exponent) bit into position - t1 |= t2; // Re-insert (sign + Exponent) bit - - fp32_bit_cast.data = t1; out = fp32_bit_cast.val; - return t3 == 0 ? 
0.0f : out; + return out; } /* bfp16_to_fp32() */ static VSI_INLINE_API uint16_t fp32_to_fp16 @@ -720,7 +712,7 @@ static VSI_INLINE_API vsi_status dtype_to_float32 *dst = fp16_to_fp32( *(int16_t *)src ); break; case VSI_NN_TYPE_BFLOAT16: - *dst = bfp16_to_fp32( *(int16_t *)src ); + *dst = bfp16_to_fp32( *(uint16_t *)src ); break; case VSI_NN_TYPE_FLOAT8_E4M3: *dst = fp8_e4m3_to_fp32(*(int8_t*)src, src_dtype->scale); diff --git a/src/tim/vx/internal/include/vsi_nn/vsi_nn.h b/src/tim/vx/internal/include/vsi_nn/vsi_nn.h deleted file mode 100644 index 115a2e8..0000000 --- a/src/tim/vx/internal/include/vsi_nn/vsi_nn.h +++ /dev/null @@ -1,2034 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ -/** - * @file vsi_nn.h - */ -#ifndef _VSI_NN_INTERFACE_H -#define _VSI_NN_INTERFACE_H - -#if defined(_MSC_VER) -#define EXPORT __declspec(dllexport) -#elif defined(__linux__) -#define EXPORT __attribute__((visibility("default"))) -#else -#define EXPORT -#endif - -#if !defined(_IN) -#define _IN -#endif -#if !defined(_OUT) -#define _OUT -#endif -#if !defined(_INOUT) -#define _INOUT -#endif -#if !defined(_OPTIONAL) -#define _OPTIONAL -#endif - -#include -#include - -#if defined(__cplusplus) -#define __BEGIN_DECLS extern "C" { -#define __END_DECLS } -#else -#define __BEGIN_DECLS -#define __END_DECLS -#endif - -__BEGIN_DECLS - - -#ifndef TRUE -#define TRUE (1) -#endif -#ifndef FALSE -#define FALSE (0) -#endif - - -/** - * Return codes. - */ -typedef enum -{ - /** - * Operation was succesful. - */ - VSI_NN_ERROR_OK = 0, - - /** - * Failure caused by vsi_nn api fail. - */ - VSI_NN_ERROR_API_FAIL = 1, - - /** - * Failure caused by not enough available memory. - */ - VSI_NN_ERROR_OUT_OF_MEMORY = 2, - - /** - * Failure caused by unexpected null argument. - */ - VSI_NN_ERROR_UNEXPECTED_NULL = 3, - - /** - * Failure caused by invalid function arguments, invalid model definition, - * invalid execution definition or invalid data at execution time. - */ - VSI_NN_ERROR_VALUED_ERROR = 4, - - /** - * Failure caused by operations that need completed graph. - */ - VSI_NN_ERROR_UNCOMPLETE_GRAPH = 5, - - /** - * Failure caused by insearting a keyword argument repeatly. - */ - VSI_NN_ERROR_KWARGS_REPEAT = 6, -} VSI_NN_error_e; - -/** - * Implicit padding algorithms. 
- */ -typedef enum -{ - /** - * Pad with const value which are specific by others parameters. - */ - VSI_NN_IMPLICIT_PADDING_NONE = 0, - - /** - * Implicit(VALID) padding. - * No padding. - */ - VSI_NN_IMPLICIT_PADDING_VALID = 1, - - /** - * Implicit(SAME) padding. - * Padding on both ends are the "same". - */ - VSI_NN_IMPLICIT_PADDING_SAME = 2, -} VSI_NN_implicit_padding_e; - -/** - * Padding mode. - */ -typedef enum -{ - /** - * Pad with const value which are specific by others parameters, default 0. - */ - VSI_NN_PADDING_MODE_CONSTANT = 0, - - /** - * Reflect padding mode - */ - VSI_NN_PADDING_MODE_REFLECT = 1, - - /** - * Symmetric padding mode - */ - VSI_NN_PADDING_MODE_SYMMETRIC = 2, - - /** - * Replicate padding mode - */ - VSI_NN_PADDING_MODE_REPLICATE = 3, -} VSI_NN_padding_mode_e; - -/** - * Rounding methods - */ -typedef enum -{ - /** - * Floor rounding - */ - VSI_NN_ROUNDING_FLOOR = 0, - /** - * Ceiling rounding - */ - VSI_NN_ROUNDING_CEIL = 1, -} VSI_NN_rounding_e; - -/** - * LSH Projection supported types. - */ -typedef enum -{ - /** - * Computed bit vector is considered to be sparse. - */ - VSI_NN_LSH_PROJECTION_SPARSE = 1, - /** - * Computed bit vector is considered to be dense. - */ - VSI_NN_LSH_PROJECTION_DENSE = 2, -} VSI_NN_lsh_projection_type_e; - -/** - * Supported activation function types. - */ -typedef enum -{ - /** No activation */ - VSI_NN_ACTIVATION_NONE = 0, - /** ReLU activation */ - VSI_NN_ACTIVATION_RELU = 1, - /** ReLU1 activation */ - VSI_NN_ACTIVATION_RELU1 = 2, - /** ReLU6 activation */ - VSI_NN_ACTIVATION_RELU6 = 3, - /** TanH activation */ - VSI_NN_ACTIVATION_TANH = 4, - /** Sigmoid activation */ - VSI_NN_ACTIVATION_SIGMOID = 5, -} VSI_NN_activation_e; - -/** - * Tensor types. - * - * The type of tensors that can be added to a graph. - */ -typedef enum -{ - /** A tensor of IEEE 754 16 bit floating point values */ - VSI_NN_TENSOR_FLOAT16 = 0, - /** A tensor of 32 bit floating point values */ - VSI_NN_TENSOR_FLOAT32 = 1, - /** A tensor of 64 bit floating point values */ - VSI_NN_TENSOR_FLOAT64 = 2, - /** - * A tensor of 8 bit boolean values. - * - * Values of this operand type are either true or false. A zero value - * represents false; any other value represents true. - */ - VSI_NN_TENSOR_BOOL8 = 3, - /** A tensor of 8 bit integer values */ - VSI_NN_TENSOR_INT8 = 4, - /** A tensor of 16 bit integer values */ - VSI_NN_TENSOR_INT16 = 5, - /** A tensor of 32 bit integer values */ - VSI_NN_TENSOR_INT32 = 6, - /** A tensor of 64 bit integer values */ - VSI_NN_TENSOR_INT64 = 7, - /** A tensor of 8 bit unsigned integer values */ - VSI_NN_TENSOR_UINT8 = 8, - /** A tensor of 16 bit unsigned integer values */ - VSI_NN_TENSOR_UINT16 = 9, - /** A tensor of 32 bit unsigned integer values */ - VSI_NN_TENSOR_UINT32 = 10, - /** A tensor of 64 bit unsigned integer values */ - VSI_NN_TENSOR_UINT64 = 11, - /** A tensor of 16 bit truncate floating point values */ - VSI_NN_TENSOR_BFLOAT16 = 12, -} VSI_NN_tensor_type_e; - -typedef enum { - /** Not a quantized tensor */ - VSI_NN_TENSOR_QUANT_NONE = 0, - /** - * A tensor of 8 bit signed integer values that represent real numbers - * - * Attached to this tensor is a number that can be used to convert - * the 8 bit integer to the real value. - * - * fraction_length: a 32 bit signed integer, in range [-128, 127]. - * - * The formula is: - * real_value = integer_value / pow(2, fraction_length). 
- */ - VSI_NN_TENSOR_QUANT8_DFP = 1, - /** - * A tensor of 16 bit signed integer values that represent real numbers - * - * Attached to this tensor is a number that can be used to convert - * the 16 bit integer to the real value. - * - * fraction_length: a 32 bit signed integer, in range [-128, 127]. - * - * The formula is: - * real_value = integer_value / pow(2, fraction_length). - */ - VSI_NN_TENSOR_QUANT16_DFP = 2, - /** - * A tensor of 32 bit signed integer values that represent real numbers - * - * Attached to this tensor is a number that can be used to convert - * the 16 bit integer to the real value. - * - * fraction_length: a 32 bit signed integer, in range [-128, 127]. - * - * The formula is: - * real_value = integer_value / pow(2, fraction_length). - */ - VSI_NN_TENSOR_QUANT32_DFP = 3, - /** - * A tensor of 64 bit signed integer values that represent real numbers - * - * Attached to this tensor is a number that can be used to convert - * the 16 bit integer to the real value. - * - * fraction_length: a 32 bit signed integer, in range [-128, 127]. - * - * The formula is: - * real_value = integer_value / pow(2, fraction_length). - */ - VSI_NN_TENSOR_QUANT64_DFP = 4, - /** - * A tensor of 8 bit signed integer values that represent real numbers - * - * Attached to this tensor is a numbers that can be used to convert - * the 8 bit integer to the real value. - * - * scale: a 32 bit floating point value greater than zero. - * - * The formula is: - * real_value = integer_value * scale. - */ - VSI_NN_TENSOR_QUANT8_SYMM = 5, - /** - * A tensor of 32 bit signed integer values that represent real numbers - * - * Attached to this tensor is a numbers that can be used to convert - * the 8 bit integer to the real value. - * - * scale: a 32 bit floating point value greater than zero. - * - * The formula is: - * real_value = integer_value * scale. - */ - VSI_NN_TENSOR_QUANT32_SYMM = 6, - /** - * A tensor of 8 bit unsigned integer values that represent real numbers - * - * Attached to this tensor are two numbers that can be used to convert - * the 8 bit integer to the real value. - * - * scale: a 32 bit floating point value greater than zero. - * zero_point: a 32 bit signed integer, in range [0, 255]. - * - * The formula is: - * real_value = (integer_value - zero_point) * scale. - */ - VSI_NN_TENSOR_QUANT8_ASYMM = 7, - /** - * A tensor of 8 bit signed integers that represent real numbers. - * - * Attached to this tensor are two numbers that can be used to convert - * the 8 bit integer to the real value. - * - * channel_dim: a 32 bit unsigned integer indicating channel dimension. - * scales: an array of positive 32 bit floating point values. - * The size of the scales array must be equal to shape[channel_dim]. - * - * The formula is: - * realValue[..., C, ...] = integerValue[..., C, ...] * scales[C] - * where C is an index in the Channel dimension. - */ - VSI_NN_TENSOR_QUANT8_PERCHANNEL_SYMM = 8, - /** - * A tensor of 32 bit signed integers that represent real numbers. - * - * Attached to this tensor are two numbers that can be used to convert - * the 8 bit integer to the real value. - * - * channel_dim: a 32 bit unsigned integer indicating channel dimension. - * scales: an array of positive 32 bit floating point values. - * The size of the scales array must be equal to shape[channel_dim]. - * - * The formula is: - * realValue[..., C, ...] = integerValue[..., C, ...] * scales[C] - * where C is an index in the Channel dimension. 
- */ - VSI_NN_TENSOR_QUANT32_PERCHANNEL_SYMM = 9, -} VSI_NN_tensor_quant_type_e; - -/** Parameters for VSI_NN_TENSOR_QUANT8_ASYMM */ -typedef struct -{ - float scale; - int32_t zero_point; -} VSI_NN_quant_param_asymm; - -/** Parameters for VSI_NN_TENSOR_QUANT8_SYMM */ -typedef struct -{ - float scale; -} VSI_NN_quant_param_symm; - -/** Parameters for VSI_NN_TENSOR_QUANT8_DFP */ -typedef struct -{ - int32_t fraction_length; -} VSI_NN_quant_param_dfp; - -/** Parameters for VSI_NN_TENSOR_QUANT8_PERCHANNEL_SYMM */ -typedef struct -{ - /** The index of the channel dimension. */ - int32_t channel_dim; - - /** - * The array of scaling values for each channel. - * Each value must be greater than zero. - */ - const float* scales; - - /** - * The size of the scale array. - * Should be equal to shape[channel_dim] of the tensor. - * */ - int32_t scale_count; -} VSI_NN_quant_param_perchannel_symm; - -/** Parameters for quantization */ -typedef struct -{ - /** Tensor quantize type */ - VSI_NN_tensor_quant_type_e type; - union - { - /** Dynamic fixed point quantization */ - VSI_NN_quant_param_dfp dfp; - /** Asymmetric affine quantization */ - VSI_NN_quant_param_asymm asymm; - /** Symmetric affine quantization */ - VSI_NN_quant_param_symm symm; - /** Perchannel symmetric affine quantization */ - VSI_NN_quant_param_perchannel_symm perchannel_symm; - } param; -} VSI_NN_tensor_quant_param; - -/** - * NN Runtime context - */ -typedef struct _vsi_nn_context_t VSI_NN_context; - -/** - * VSI_NN_graph is an opaque type that contains a description of the network operations. - * - * Create graph by calling VSI_NN_graph_create. - * A graph is completed by calling VSI_NN_graph_verify. - * A graph is destroyed by calling VSI_NN_graph_release. - * - */ -typedef struct _vsi_nn_graph VSI_NN_graph; - -/** - * VSI_NN_tensor is an opaque type that can be used to describe a tensor. - * - * Create tensor by calling VSI_NN_tensor_create. - * - */ -typedef struct _vsi_nn_tensor VSI_NN_tensor; - -/** - * Create context - * - * @return Context handle on success or NULL otherwise. - */ -EXPORT VSI_NN_context* VSI_NN_context_create(); - -/** - * Release context - * - * @param[in] ctx_ptr The pointer to context to release, and reset point to null. - */ -EXPORT void VSI_NN_context_release - ( - _IN VSI_NN_context** ctx_ptr - ); - -/** - * Create graph - * Create a net graph. - * - * @param[in] ctx The context used to create graph. - * @return The graph on success, or NULL otherwise. - */ -EXPORT VSI_NN_graph* VSI_NN_graph_create - ( - VSI_NN_context* ctx - ); - -/** - * Release graph - * Release a graph and free its resource. - * - * @param[in] graph_ptr The graph to be release. - */ -EXPORT void VSI_NN_graph_release - ( - _IN VSI_NN_graph** graph_ptr - ); - -/** - * Identify graph inputs and outputs - * Identify the input and output tensors of a graph. User should call this to - * specific the inputs and outputs, they are used to exchange data between application - * level and VSI_NN level. - * - * @param[in] graph The graph to be identify. - * @param[in] input_tensors Input tensors. - * @param[in] input_tensors_num Number of input tensors. - * @param[in] output_tensors Output tensors. - * @param[in] output_tensors_num Number of output tensors. 
- * @return VSI_NN_ERROR_OK on success - */ -EXPORT VSI_NN_error_e VSI_NN_graph_identify_input_output - ( - _IN VSI_NN_graph* graph, - _IN const VSI_NN_tensor** input_tensors, - _IN const int32_t input_tensors_num, - _IN const VSI_NN_tensor** output_tensors, - _IN const int32_t output_tensors_num - ); - -/** - * To freeze a graph with verifying and compiling. - * - * This function may take a long time to compile the graph, and it must only be called - * once for a given graph. - * - * A frozen graph cannot be modified. - * - * @param[in] graph The graph to be finished. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_graph_verify - ( - _IN VSI_NN_graph* graph - ); - -/** - * Compute a frozen graph. - * - * @param[in] graph The graph to be executed. - * - * @return VSI_NN_ERROR_OK on success. VSI_NN_ERROR_UNCOMPLETE_GRAPH if - * the graph is not finished. - */ -EXPORT VSI_NN_error_e VSI_NN_graph_compute - ( - _IN const VSI_NN_graph* graph - ); - -//EXPORT VSI_NN_error_e VSI_NN_GRPAH_profile(_IN const VSI_NN_graph* graph); - -/** - * Add a tensor to a graph. - * - * @param[in] graph The graph to be added. - * @param[in] dtype The data type. - * @param[in] shape The shape for the tensor. - * @param[in] ndim The rank for the tensor. - * @param[in] memory The memory address to the data, the memory address - * must be 64-byte align. If it's set to null, vsi_nn can - * optimize the memory allocation and this is default behavior. - * @param[in] memory_size The size of memory. - * @param[in] quant_param The quantization parameters for the tensor, set - * null if it's not quantized tensor. - * - * @return Tensor handle on success, or NULL if get failure. - */ -EXPORT VSI_NN_tensor* VSI_NN_tensor_create - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor_type_e dtype, - _IN const int32_t* shape, - _IN int32_t ndim, - _IN const VSI_NN_tensor_quant_param* quant_param, - _IN void* memory, - _IN size_t memory_size, - _IN int32_t is_constant - ); - -/** - * Add a virtual tensor to a graph. - * - * @param[in] graph The graph to be added. - * @param[in] dtype The data type. - * - * @return Tensor handle on success, or NULL if get failure. - */ -EXPORT VSI_NN_tensor* VSI_NN_tensor_create_virtual - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor_type_e dtype, - _IN const VSI_NN_tensor_quant_param* quant_param - ); - -/** - * Get element size of a tensor. - * - * @param[in] tensor Tensor to query element size. - * - * @return Element size of the tensor. - */ -EXPORT int32_t VSI_NN_tensor_get_size - ( - _IN const VSI_NN_tensor* tensor - ); - -/** - * Get bytes of a tensor. - * - * @param[in] tensor Tensor to query element size. - * - * @return Bytes of the tensor. - */ -EXPORT int32_t VSI_NN_tensor_get_bytes - ( - _IN const VSI_NN_tensor* tensor - ); - -/** - * Read tensor data. - * - * @param[in] tensor Tensor to read. - * @param[in] memory Memory to fill the data. - * @param[in] memory_size Element size of the read data, - * must be equal to tensor size. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_tensor_read - ( - _IN VSI_NN_tensor* tensor, - _IN void* memory, - _IN size_t memory_size - ); - -/** - * Write data to tensor. - * - * @param[in] tensor Tensor to write. - * @param[in] memory Memory with the data. - * @param[in] memory_size Element size of the write data, - * must be equal to tensor size. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_tensor_write - ( - _IN VSI_NN_tensor* tensor, - _IN void* memory, - _IN size_t memory_size - ); - -/** - * Swap tensors' memories. - * - * @param[in] tensor1 Tensor to swap the memory. - * @param[in] tensor2 Tensor to swap the memory. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_tensor_swap - ( - _IN VSI_NN_tensor* tensor1, - _IN VSI_NN_tensor* tensor2 - ); - -/** - * Swap tensor memories. - * User can use this api to get tensor's original memory. - * - * @param[in] tensor Tensor to swap the memory. - * @param[in] new_memory The new memory for the tensor, - * if NULL, there is no memory swapped. - * @param[in] old_memory Pointer for the tensor's original memory. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_tensor_swap_memory - ( - _IN VSI_NN_tensor* tensor, - _IN _OPTIONAL void* new_memory, - _INOUT void** old_memory - ); - -/** - * Flush tensor memory - * Once a tensor's memory is dirty, user should call this api to sync NPU memory. - * - * @param[in] tensor Tensor to flush memory - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_tensor_flush_memory - ( - _IN const VSI_NN_tensor* tensor - ); - -/** Convolutional */ -/** - * Convolution 1D node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] kernel Kernel with a 3D tensor. - * @param[in] bias Bias with a 1D tensor. - * @param[in] output Node output tensor. - * @param[in] stride Convolution stride. - * @param[in] dilation Convolution dilation rate. - * @param[in] pad_front Padding front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_end Padding end value. - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] implicit_padding Implicit_padding with value VSI_NN_implicit_padding_e. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_conv_1d - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* kernel, - _IN _OPTIONAL VSI_NN_tensor* bias, - _IN VSI_NN_tensor* output, - _IN int32_t stride, - _IN int32_t dilation, - _IN int32_t pad_front, _IN int32_t pad_end, - _IN VSI_NN_implicit_padding_e implicit_padding - ); - -/** - * Convolution 2D node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] kernel Kernel with a 4D tensor. - * @param[in] bias Bias with a 1D tensor. - * @param[in] output Node output tensor. - * @param[in] stride_h Convolution stride height. - * @param[in] stride_w Convolution stride width. - * @param[in] dilation_h Convolution height dilation rate. - * @param[in] dilation_w Convolution width dilation rate. - * @param[in] pad_h_front Padding height front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_h_end Padding height front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_w_front Padding width front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_w_end Padding widht front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] implicit_padding Implicit_padding with value VSI_NN_implicit_padding_e. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_node_conv_2d - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* kernel, - _IN _OPTIONAL VSI_NN_tensor* bias, - _IN VSI_NN_tensor* output, - _IN int32_t stride_h, _IN int32_t stride_w, - _IN int32_t dilation_h, _IN int32_t dilation_w, - _IN int32_t pad_h_front, _IN int32_t pad_h_end, - _IN int32_t pad_w_front, _IN int32_t pad_w_end, - _IN VSI_NN_implicit_padding_e implicit_padding - ); - -/** - * Depthwise Convolution 2D node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] kernel Kernel with a 4D tensor. - * @param[in] bias Bias with a 1D tensor. - * @param[in] output Node output tensor. - * @param[in] multiplier Depthwise convolution multiplier. - * @param[in] stride_h Convolution stride height. - * @param[in] stride_w Convolution stride width. - * @param[in] dilation_h Convolution height dilation rate. - * @param[in] dilation_w Convolution width dilation rate. - * @param[in] pad_h_front Padding height front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_h_end Padding height front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_w_front Padding width front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_w_end Padding widht front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] implicit_padding Implicit_padding with value VSI_NN_implicit_padding_e. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_depthwise_conv_2d - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* kernel, - _IN _OPTIONAL VSI_NN_tensor* bias, - _IN VSI_NN_tensor* output, - _IN int32_t multiplier, - _IN int32_t stride_h, _IN int32_t stride_w, - _IN int32_t dilation_h, _IN int32_t dilation_w, - _IN int32_t pad_h_front, _IN int32_t pad_h_end, - _IN int32_t pad_w_front, _IN int32_t pad_w_end, - _IN VSI_NN_implicit_padding_e implicit_padding - ); - -/** - * Grouped Convolution 2D node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] kernel Kernel with a 4D tensor. - * @param[in] bias Bias with a 1D tensor. - * @param[in] output Node output tensor. - * @param[in] group_number Group number for the convolution. - * @param[in] stride_h Convolution stride height. - * @param[in] stride_w Convolution stride width. - * @param[in] dilation_h Convolution height dilation rate. - * @param[in] dilation_w Convolution width dilation rate. - * @param[in] pad_h_front Padding height front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_h_end Padding height front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_w_front Padding width front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] pad_w_end Padding widht front value, - * this field only effect when implicit - * padding is VSI_NN_IMPLICIT_PADDING_NONE. - * @param[in] implicit_padding Implicit_padding with value VSI_NN_implicit_padding_e. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_node_grouped_conv_2d - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* kernel, - _IN _OPTIONAL VSI_NN_tensor* bias, - _IN VSI_NN_tensor* output, - _IN int32_t group_number, - _IN int32_t stride_h, _IN int32_t stride_w, - _IN int32_t dilation_h, _IN int32_t dilation_w, - _IN int32_t pad_h_front, _IN int32_t pad_h_end, - _IN int32_t pad_w_front, _IN int32_t pad_w_end, - _IN VSI_NN_implicit_padding_e implicit_padding - ); - -EXPORT VSI_NN_error_e VSI_NN_node_transposed_conv_2d - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* kernel, - _IN _OPTIONAL VSI_NN_tensor* bias, - _IN VSI_NN_tensor* output, - _IN int32_t stride_h, _IN int32_t stride_w, - _IN int32_t dilation_h, _IN int32_t dilation_w, - _IN int32_t pad_h_front, _IN int32_t pad_h_end, - _IN int32_t pad_w_front, _IN int32_t pad_w_end, - _IN int32_t output_pad_h, _IN int32_t output_pad_w - ); - -/** Pooling */ -EXPORT VSI_NN_error_e VSI_NN_node_average_pool_2d - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t ksize_h, _IN int32_t ksize_w, - _IN int32_t stride_h, _IN int32_t stride_w, - _IN int32_t pad_h_front, _IN int32_t pad_h_end, - _IN int32_t pad_w_front, _IN int32_t pad_w_end, - _IN VSI_NN_implicit_padding_e implicit_padding, - _IN VSI_NN_rounding_e size_rounding - ); - -EXPORT VSI_NN_error_e VSI_NN_node_max_pool_2d - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t ksize_h, _IN int32_t ksize_w, - _IN int32_t stride_h, _IN int32_t stride_w, - _IN int32_t pad_h_front, _IN int32_t pad_h_end, - _IN int32_t pad_w_front, _IN int32_t pad_w_end, - _IN VSI_NN_implicit_padding_e implicit_padding, - _IN VSI_NN_rounding_e size_rounding - ); - -EXPORT VSI_NN_error_e VSI_NN_node_l2_pool_2d - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t ksize_h, _IN int32_t ksize_w, - _IN int32_t stride_h, _IN int32_t stride_w, - _IN int32_t pad_h_front, _IN int32_t pad_h_end, - _IN int32_t pad_w_front, _IN int32_t pad_w_end, - _IN VSI_NN_implicit_padding_e implicit_padding, - _IN VSI_NN_rounding_e size_rounding - ); - -EXPORT VSI_NN_error_e VSI_NN_node_unpool_2d(); - -/** Normalization */ -EXPORT VSI_NN_error_e VSI_NN_node_batch_normalization - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* mean, - _IN VSI_NN_tensor* variance, - _IN VSI_NN_tensor* offset, - _IN VSI_NN_tensor* scale, - _IN VSI_NN_tensor* output, - _IN float variance_epsilon - ); - -/** - * L2 Normalization node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * @param[in] axis Normalize axis. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_node_l2_normalization - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t axis - ); - -EXPORT VSI_NN_error_e VSI_NN_node_local_response_normalization - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t depth_radius, - _IN float bias, - _IN float alpha, - _IN float beta, - _IN int32_t axis - ); - -EXPORT VSI_NN_error_e VSI_NN_node_instance_normalization - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* offset, - _IN VSI_NN_tensor* scale, - _IN VSI_NN_tensor* output, - _IN float variance_epsilon - ); - -/** Math */ -/** - * Add node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_add - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Multiply node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_mul - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Divide node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_div - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Subtract node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_sub - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Floor node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_floor - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Square node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_square - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Sqrt node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_sqrt - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Rsqrt node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_node_rsqrt - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Matmul node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * @param[in] transpose_input1 Whether to do transpose on input1. - * @param[in] transpose_input2 Whether to do transpose on input2. - * @param[in] transpose_output Whether to do transpose on output. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_matmul - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output, - _IN int transpose_input1, - _IN int transpose_input2, - _IN int transpose_output - ); - -/** - * Abs node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_abs - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Pow node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_pow - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Maximum node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_maximum - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Minimum node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_minimum - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Exp node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_exp - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Reverse node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * @param[in] axes Axes to reverse. - * @param[in] axes_size Number of axis to reverse. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_reverse - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* axes, - _IN int32_t axes_size - ); - -/** - * Transpose node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * @param[in] perm Transpose order. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_node_transpose - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* perm - ); - -EXPORT VSI_NN_error_e VSI_NN_node_gather - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* indices, - _IN VSI_NN_tensor* output, - _IN int32_t axis - ); - -/** - * Neg node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_neg - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Reduce max node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * @param[in] axes Axes to reduce. - * @param[in] axes_size Number of axis to reduce. - * @param[in] keep_dim Whether to keep dims on output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_reduce_max - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* axes, - _IN int32_t axes_size, - _IN int32_t keep_dim - ); - -/** - * Reduce min node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * @param[in] axes Axes to reduce. - * @param[in] axes_size Number of axis to reduce. - * @param[in] keep_dim Whether to keep dims on output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_reduce_min - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* axes, - _IN int32_t axes_size, - _IN int32_t keep_dim - ); - -/** - * Reduce sum node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * @param[in] axes Axes to reduce. - * @param[in] axes_size Number of axis to reduce. - * @param[in] keep_dim Whether to keep dims on output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_reduce_sum - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* axes, - _IN int32_t axes_size, - _IN int32_t keep_dim - ); - -/** - * Reduce mean node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * @param[in] axes Axes to reduce. - * @param[in] axes_size Number of axis to reduce. - * @param[in] keep_dim Whether to keep dims on output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_reduce_mean - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* axes, - _IN int32_t axes_size, - _IN int32_t keep_dim - ); - -/** - * Sin node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_node_sin - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -EXPORT VSI_NN_error_e VSI_NN_node_tile - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* multiples, - _IN int32_t multiples_size - ); - -EXPORT VSI_NN_error_e VSI_NN_node_topk - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN VSI_NN_tensor* output_indices, - _IN int32_t k - ); - -/** Logical */ -/** - * Equal node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_equal - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Greater node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_greater - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Greater equal node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_greater_equal - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Less node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_less - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Less equal node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_less_equal - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Logical and node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_logical_and - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Logical or node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_logical_or - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Logical not node. - * - * @param[in] graph Graph to create the node. 
- * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_logical_not - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Not equal node. - * - * @param[in] graph Graph to create the node. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_not_equal - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** - * Select node. - * If conditon is true, then output input1 tensor, - * else output input2 tensor. - * - * @param[in] graph Graph to create the node. - * @param[in] condition Conditon tensor.. - * @param[in] input1 Node input tensor. - * @param[in] input2 Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_select - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* condition, - _IN VSI_NN_tensor* input1, - _IN VSI_NN_tensor* input2, - _IN VSI_NN_tensor* output - ); - -/** Activation */ -/** - * relu node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_relu - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * ReLU1 node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_relu1 - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * ReLU6 node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_relu6 - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -EXPORT VSI_NN_error_e VSI_NN_node_tanh - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN float scale_a, - _IN float scale_b - ); - -/** - * Sigmoid node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_sigmoid - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Hard sigmoid node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_hard_sigmoid - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Mish node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_node_mish - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -EXPORT VSI_NN_error_e VSI_NN_node_leaky_relu - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN float ratio - ); - -EXPORT VSI_NN_error_e VSI_NN_node_prelu - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* alpha, - _IN VSI_NN_tensor* output - ); - -/** - * Soft relu node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_soft_relu - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Elu node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_elu - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** Misc */ -EXPORT VSI_NN_error_e VSI_NN_node_pad - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN VSI_NN_padding_mode_e mode, - _IN const int32_t* pad_front, - _IN const int32_t* pad_end, - _IN int32_t pad_value - ); - -EXPORT VSI_NN_error_e VSI_NN_node_fully_connected - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* kernel, - _IN _OPTIONAL VSI_NN_tensor* bias, - _IN VSI_NN_tensor* output, - _IN int32_t axis - ); - -EXPORT VSI_NN_error_e VSI_NN_node_concate - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* const inputs[], - _IN int32_t input_num, - _IN VSI_NN_tensor* output, - _IN int32_t axis - ); - -EXPORT VSI_NN_error_e VSI_NN_node_split - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* const outputs[], - _IN int32_t output_num, - _IN const int32_t* slices, - _IN int32_t slices_size, - _IN int32_t axis - ); - -/** - * Cast node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_cast - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Quantize node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. - */ -EXPORT VSI_NN_error_e VSI_NN_node_quantize - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -/** - * Dequantize node. - * - * @param[in] graph Graph to create the node. - * @param[in] input Node input tensor. - * @param[in] output Node output tensor. - * - * @return VSI_NN_ERROR_OK on success. 
- */ -EXPORT VSI_NN_error_e VSI_NN_node_dequantize - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output - ); - -EXPORT VSI_NN_error_e VSI_NN_node_space_to_batch - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* block_size, - _IN int32_t block_size_num, - _IN const int32_t* pad_front, - _IN const int32_t* pad_end - ); - -EXPORT VSI_NN_error_e VSI_NN_node_batch_to_space - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* block_size, - _IN int32_t block_size_num, - _IN const int32_t* crop_front, - _IN const int32_t* crop_end - ); - -EXPORT VSI_NN_error_e VSI_NN_node_space_to_depth - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* block_size, - _IN int32_t block_size_num, - _IN const int32_t* pad_front, - _IN const int32_t* pad_end - ); - -EXPORT VSI_NN_error_e VSI_NN_node_depth_to_space - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* block_size, - _IN int32_t block_size_num, - _IN const int32_t* crop_front, - _IN const int32_t* crop_end - ); - -EXPORT VSI_NN_error_e VSI_NN_node_channel_shuffle - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t group_number, - _IN int32_t axis - ); - -EXPORT VSI_NN_error_e VSI_NN_node_expand_dims - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t axis - ); - -EXPORT VSI_NN_error_e VSI_NN_node_hashtable_lookup - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* lookups, - _IN VSI_NN_tensor* keys, - _IN VSI_NN_tensor* values, - _IN VSI_NN_tensor* output, - _IN VSI_NN_tensor* output_hits - ); - -EXPORT VSI_NN_error_e VSI_NN_node_embedding_lookup - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* lookups, - _IN VSI_NN_tensor* values, - _IN VSI_NN_tensor* output - ); - -EXPORT VSI_NN_error_e VSI_NN_node_lsh_projection - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* hash_func, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* weight, - _IN VSI_NN_tensor* output, - _IN VSI_NN_lsh_projection_type_e type - ); - -EXPORT VSI_NN_error_e VSI_NN_node_slice - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* begin, - _IN const int32_t* size - ); - -EXPORT VSI_NN_error_e VSI_NN_node_strided_slice - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN const int32_t* begin, - _IN const int32_t* end, - _IN const int32_t* strides, - _IN int32_t begin_mask, - _IN int32_t end_mask, - _IN int32_t shrink_axis_mask - ); - -EXPORT VSI_NN_error_e VSI_NN_node_argmax - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t axis - ); - -EXPORT VSI_NN_error_e VSI_NN_node_argmin - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t axis - ); - -/** Detection */ -EXPORT VSI_NN_error_e VSI_NN_node_roi_pool - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* feature_map, - _IN VSI_NN_tensor* loc, - _IN VSI_NN_tensor* batch_index, - _IN VSI_NN_tensor* output, - _IN int32_t output_h, - _IN int32_t output_w, - _IN float ratio_h, - _IN float ratio_w - ); - -EXPORT VSI_NN_error_e VSI_NN_node_roi_align - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* feature_map, - _IN VSI_NN_tensor* loc, - _IN VSI_NN_tensor* batch_index, - _IN VSI_NN_tensor* 
output, - _IN int32_t output_h, - _IN int32_t output_w, - _IN float ratio_h, - _IN float ratio_w, - _IN int32_t sample_num_h, - _IN int32_t sample_num_w - ); - -/** Image transform */ -EXPORT VSI_NN_error_e VSI_NN_node_resize_bilinear - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t scale_h, - _IN int32_t scale_w - ); - -EXPORT VSI_NN_error_e VSI_NN_node_resize_nearest - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* output, - _IN int32_t scale_h, - _IN int32_t scale_w - ); - -/** RNN */ -EXPORT VSI_NN_error_e VSI_NN_node_svdf - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* weights_feature, - _IN VSI_NN_tensor* weights_time, - _IN VSI_NN_tensor* bias, - _IN VSI_NN_tensor* input_state, - _IN VSI_NN_tensor* output, - _IN VSI_NN_tensor* output_state, - _IN int32_t rank - ); - -//EXPORT VSI_NN_error_e VSI_NN_node_rnn(); - -EXPORT VSI_NN_error_e VSI_NN_node_rnn_unit - ( - _IN VSI_NN_graph* graph, - _IN VSI_NN_tensor* input, - _IN VSI_NN_tensor* input_state, - _IN VSI_NN_tensor* weight, _IN VSI_NN_tensor* recrrent_weight, - _IN VSI_NN_tensor* bias, - _IN VSI_NN_tensor* output, - _IN VSI_NN_tensor* output_state, - _IN VSI_NN_activation_e activation - ); - -EXPORT VSI_NN_error_e VSI_NN_node_lstm_unit - ( - _IN VSI_NN_graph* graph - ); - -__END_DECLS -#endif diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index b426e4b..d10a29b 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -61,14 +61,13 @@ typedef struct _vsi_nn_hw_config_t { char target_name[VSI_NN_MAX_TARGET_NAME]; vsi_nn_hw_evis_t evis; -#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT uint32_t subGroupSize; -#endif uint32_t use_40bits_va; uint32_t support_stream_processor; uint32_t sp_exec_count; uint32_t sp_vector_depth; uint32_t sp_per_core_vector_depth; + uint32_t support_ffd; } vsi_nn_hw_config_t; typedef struct _vsi_nn_runtime_option_t @@ -89,6 +88,7 @@ typedef struct _vsi_nn_runtime_option_t int32_t enable_save_file_type; int32_t enable_use_image_process; int32_t enable_use_from_handle; + vsi_nn_hw_config_t config; } vsi_nn_runtime_option_t; /** @@ -101,6 +101,15 @@ typedef struct _vsi_nn_context_t vsi_nn_runtime_option_t options; } VSI_PUBLIC_TYPE *vsi_nn_context_t; +/** + * Query and set options->config hw params. + */ +OVXLIB_API vsi_status query_hardware_caps_runtime + ( + vsi_nn_context_t ctx, + vsi_nn_runtime_option_t *options + ); + /** * Create context * Create ovxlib NN runtime context. @@ -113,6 +122,11 @@ OVXLIB_API vsi_status vsi_nn_initOptions ( vsi_nn_runtime_option_t *options ); +OVXLIB_API vsi_status vsi_nn_initOptions_runtime + ( + vsi_nn_runtime_option_t *options, + vsi_nn_context_t ctx + ); /** * Release context * Release ovxlib NN runtime resource and reset context handle to NULL. 
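For context, a minimal usage sketch of the runtime hardware-capability query declared in the vsi_nn_context.h hunk above; it is not part of this patch. It assumes the application includes the ovxlib public umbrella header and owns the context via vsi_nn_CreateContext(); whether query_hardware_caps_runtime is meant to be called directly by applications, rather than only internally, is also an assumption.

#include <stdio.h>
#include "vsi_nn_pub.h"   /* assumed umbrella header for ovxlib */

int main(void)
{
    vsi_nn_context_t ctx = vsi_nn_CreateContext();
    if (NULL == ctx)
    {
        return -1;
    }

    /* Fill ctx->options.config with the capabilities reported by the driver. */
    if (VSI_SUCCESS == query_hardware_caps_runtime(ctx, &ctx->options))
    {
        printf("subGroupSize=%u, stream_processor=%u, ffd=%u\n",
               ctx->options.config.subGroupSize,
               ctx->options.config.support_stream_processor,
               ctx->options.config.support_ffd);
    }

    vsi_nn_ReleaseContext(&ctx);
    return 0;
}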
diff --git a/src/tim/vx/internal/include/vsi_nn_feature_config.h b/src/tim/vx/internal/include/vsi_nn_feature_config.h index b70b1dc..66361bd 100644 --- a/src/tim/vx/internal/include/vsi_nn_feature_config.h +++ b/src/tim/vx/internal/include/vsi_nn_feature_config.h @@ -57,5 +57,8 @@ #define VSI_PER_GROUP_QUANTIZATION_SUPPORT #endif #define VSI_GRAPH_RUNTIME_ENV_SUPPORT +#if defined(VX_TENSOR_SPARSITY_SUPPORT) +#define VSI_TENSOR_SPARSITY_SUPPORT +#endif #endif diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index dc82aeb..ddb21ef 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -216,6 +216,7 @@ #include "ops/vsi_nn_op_grouped_conv3d.h" #include "ops/vsi_nn_op_col2im.h" #include "ops/vsi_nn_op_l1_layer_norm.h" +#include "ops/vsi_nn_op_rope.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" #include "ops/vsi_nn_op_inverse_sigmoid.h" @@ -420,6 +421,7 @@ typedef union _vsi_nn_nn_param vsi_nn_grouped_conv3d_param grouped_conv3d; vsi_nn_col2im_param col2im; vsi_nn_l1_layer_norm_param l1_layer_norm; + vsi_nn_rope_param rope; void* client_param; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_tensor.h b/src/tim/vx/internal/include/vsi_nn_tensor.h index 90dcb22..2efb763 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor.h @@ -86,8 +86,10 @@ typedef enum VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6, /** perchannel float8 */ VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7, - /** GPQT */ + /** pergroup symmetric */ VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC = 0x8, + /** pergroup asymmetric */ + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC = 0x9, /** undefined type */ VSI_NN_QNT_TYPE_NA = 0xff, } vsi_nn_qnt_type_e; diff --git a/src/tim/vx/internal/include/vsi_nn_tensor_util.h b/src/tim/vx/internal/include/vsi_nn_tensor_util.h index 4c88f95..9a0acca 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor_util.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor_util.h @@ -418,6 +418,34 @@ OVXLIB_API vsi_status vsi_nn_SetTensorIsScalar int8_t is_scalar ); +/** + * Get Tensor is_scalar + * Get the is_sparsity of the tensor + * + * @param[in] tensor Tensor. + * + * @return is_sparsity flag of the tensor. + */ +OVXLIB_API int32_t vsi_nn_GetTensorIsSparsity +( + vsi_nn_tensor_t* tensor +); + +/** + * Set Weight Tensor whether is sparsity + * Set the is_sparsity for the tensor + * + * @param[in] tensor Tensor. + * @param[in] new is_sparsity value of the tensor. + * + * @return VSI_SUCCESS on success, or error core otherwise. 
+**/ + +OVXLIB_API vsi_status vsi_nn_SetTensorIsSparsity( + vsi_nn_tensor_t* tensor, + int32_t is_sparsity +); + OVXLIB_API vsi_status vsi_nn_CopyRawDataToTensor ( vsi_nn_graph_t* graph, diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 37368a4..30d0adb 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 2 -#define VSI_NN_VERSION_PATCH 14 +#define VSI_NN_VERSION_PATCH 22 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_letterbox_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_letterbox_evis.c new file mode 100644 index 0000000..67a0833 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_letterbox_evis.c @@ -0,0 +1,475 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ + +#define _CUSTOM_LETTERBOX_KERNEL_SOURCE "custom_letterbox" + +// Add kernel hashtable here +#define CUSTOM_LETTERBOX_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8 )) +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { CUSTOM_LETTERBOX_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("evis.custom_letterbox_"#IN_DTYPE"to"#OUT_DTYPE), \ + _CUSTOM_LETTERBOX_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _custom_letterbox_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, U8 ), + PACK_KERNEL_MAP( U8, I8 ), + PACK_KERNEL_MAP( U8, F16 ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _custom_letterbox_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _CUSTOM_LETTERBOX_PARAM_NUM _cnt_of_array( _custom_letterbox_kernel_param_def ) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_custom_letterbox_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + VSI_UNREFERENCED(param_size); + int32_t top = 0; + int32_t bottom = 0; + int32_t left = 0; + int32_t right = 0; + float scale_w = 0; + float scale_h = 0; + int32_t resize_w = 0; + int32_t resize_h = 0; + int32_t resize_max_w = 0; + int32_t resize_max_h = 0; + float output_scale = 1.0f; + float output_zp = 0; + float out_scale_r = 0; + float out_zp_r = 0; + float out_scale_g = 0; + float out_zp_g = 0; + float out_scale_b = 0; + float out_zp_b = 0; + float pad_v_r = 0; + float pad_v_g = 0; + float pad_v_b = 0; + int32_t in_width = 0; + int32_t in_height = 0; + int32_t out_width = 0; + int32_t out_height = 0; + float mean_r = 0; + float mean_g = 0; + float mean_b = 0; + float scale_r = 0; + float scale_g = 0; + float scale_b = 0; + vx_int32 pad_value_r = 0; + vx_int32 pad_value_g = 0; + vx_int32 pad_value_b = 0; + vx_int32 r_order = 0; + vx_int32 b_order = 0; + vx_int32 reverse_channel = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &top); + status |= 
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &bottom); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &left); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &right); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &mean_r); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &mean_g); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &mean_b); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &scale_r); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &scale_g); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[11], &scale_b); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &pad_value_r); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &pad_value_g); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[14], &pad_value_b); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[15], &reverse_channel); + CHECK_STATUS_FAIL_GOTO(status, final ); + + in_width = (int32_t)attr[0]->shape->data[0] / 3; + in_height = (int32_t)attr[0]->shape->data[1]; + out_width = (int32_t)attr[1]->shape->data[0]; + out_height = (int32_t)attr[1]->shape->data[1] / 3; + + output_scale = 1.0f / attr[1]->scale; + output_zp = (float)(attr[1]->zero_point); + + resize_w = out_width - left - right; + resize_h = out_height - top - bottom; + resize_max_w = out_width - right; + resize_max_h = out_height - bottom; + scale_w = (float)in_width / resize_w; + scale_h = (float)in_height / resize_h; + out_scale_r = scale_r / output_scale; + out_zp_r = output_zp - out_scale_r * mean_r; + out_scale_g = scale_g / output_scale; + out_zp_g = output_zp - out_scale_g * mean_g; + out_scale_b = scale_b / output_scale; + out_zp_b = output_zp - out_scale_b * mean_b; + pad_v_r = pad_value_r * out_scale_r + out_zp_r; + pad_v_g = pad_value_g * out_scale_g + out_zp_g; + pad_v_b = pad_value_b * out_scale_b + out_zp_b; + + if (reverse_channel) + { + r_order = out_height * 2; + b_order = 0; + } + else + { + r_order = 0; + b_order = out_height * 2; + } + + { + gpu_dp_inst_t uniU8RightSubLeft_4x4 = {{ + 0x00090909, // TCfg + 0x00000000, // ASelt + 0x00140003, 0x00000025, // ABin + 0x000a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniLeftToFloat32_4x4 = {{ + 0x00010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00000002, // ABin + 0x00020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtactHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtract8Data_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 
0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniLeftToFloat32_4x4", &uniLeftToFloat32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Data_2x8", &uniExtract8Data_2x8 ); + } + status |= vsi_nn_kernel_gpu_add_param( node, "top", &top ); + status |= vsi_nn_kernel_gpu_add_param( node, "left", &left ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_r", &out_scale_r ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_g", &out_scale_g ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_b", &out_scale_b ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_r", &out_zp_r ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_g", &out_zp_g ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_b", &out_zp_b ); + status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_r", &pad_v_r ); + status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_g", &pad_v_g ); + status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_b", &pad_v_b ); + status |= vsi_nn_kernel_gpu_add_param( node, "scale_w", &scale_w ); + status |= vsi_nn_kernel_gpu_add_param( node, "scale_h", &scale_h ); + status |= vsi_nn_kernel_gpu_add_param( node, "resize_max_w", &resize_max_w ); + status |= vsi_nn_kernel_gpu_add_param( node, "resize_max_h", &resize_max_h ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height ); + status |= vsi_nn_kernel_gpu_add_param( node, "r_order", &r_order ); + status |= vsi_nn_kernel_gpu_add_param( node, "b_order", &b_order ); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = out_width; + gpu_param.global_size[1] = out_height; + + status |= vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _custom_warp_affine_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _custom_letterbox_kernel_map; + size_t kernel_map_size = _cnt_of_array( _custom_letterbox_kernel_map ); + vx_param_description_t * param_def = _custom_letterbox_kernel_param_def; + size_t param_def_size = _cnt_of_array( _custom_letterbox_kernel_param_def ); + vx_kernel_initialize_f initializer = _custom_letterbox_initializer; + uint32_t key = 0; + uint32_t i = 0; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = CUSTOM_LETTERBOX_HASH_KEY( in_dtype, out_dtype ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + 
kernel->info.parameters = param_def; + kernel->info.numParams = (vx_uint32)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CUSTOM_LETTERBOX_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + size_t i = 0; + + int32_t top = vsi_nn_kernel_param_get_int32( params, "top"); + int32_t bottom = vsi_nn_kernel_param_get_int32( params, "bottom"); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left"); + int32_t right = vsi_nn_kernel_param_get_int32( params, "right"); + float mean_r = vsi_nn_kernel_param_get_float32( params, "mean_r"); + float mean_g = vsi_nn_kernel_param_get_float32( params, "mean_g"); + float mean_b = vsi_nn_kernel_param_get_float32( params, "mean_b"); + float scale_r = vsi_nn_kernel_param_get_float32( params, "scale_r"); + float scale_g = vsi_nn_kernel_param_get_float32( params, "scale_g"); + float scale_b = vsi_nn_kernel_param_get_float32( params, "scale_b"); + int32_t pad_value_r = vsi_nn_kernel_param_get_int32( params, "pad_value_r"); + int32_t pad_value_g = vsi_nn_kernel_param_get_int32( params, "pad_value_g"); + int32_t pad_value_b = vsi_nn_kernel_param_get_int32( params, "pad_value_b"); + int32_t reverse_channel = vsi_nn_kernel_param_get_int32( params, "reverse_channel"); + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + + uint32_t param_num = _CUSTOM_LETTERBOX_PARAM_NUM; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + shapes[0][0] = inputs[0]->attr.size[1] * 3; + shapes[0][1] = inputs[0]->attr.size[2]; + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[1][1] = outputs[0]->attr.size[1] * 3; + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], 2 ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[1], 2 ); + + if (reshape_tensors[0] == NULL || + reshape_tensors[1] == NULL) + { + goto final; + } + + if (reverse_channel) + { + float mean_temp = mean_r; + float scale_temp = scale_r; + int32_t pad_value_temp = pad_value_r; + mean_r = mean_b; + mean_b = mean_temp; + scale_r = scale_b; + scale_b = scale_temp; + pad_value_r = pad_value_b; + pad_value_b = pad_value_temp; + } + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 2; + + vsi_nn_kernel_node_pack_io( node_params, param_num, + reshape_tensors, 1, &reshape_tensors[1], 1 ); + + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &bottom ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &right ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_r ); + node_params[index++] = vsi_nn_kernel_scalar_create( 
graph, F32, &mean_g ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_b ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_r ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_g ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_b ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_r ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_g ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_b ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse_channel ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, param_num ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + vsi_nn_kernel_scalar_release( &node_params[13] ); + vsi_nn_kernel_scalar_release( &node_params[14] ); + vsi_nn_kernel_scalar_release( &node_params[15] ); + + CHECK_STATUS(status); + } + } + +final: + for (i = 0; i < 2; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( custom_letterbox, _setup ) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c index 6dc60ce..7889891 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c @@ -35,6 +35,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #include "libnnext/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #define _CPU_ARG_NUM (1) #define _CPU_INPUT_NUM (1) @@ -42,6 +43,7 @@ #define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) #define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) #define _KERNEL_NAME ("com.vivantecorp.extension.Softmax2VXC") +#define _KERNEL_NAME_U8 ("com.vivantecorp.extension.Softmax2VXC_u8") #define SCALAR_INPUT_AXIS (2) @@ -64,7 +66,11 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer) { vsi_status status = VSI_FAILURE; int sf_size = 0; - vsi_nn_kernel_tensor_attr_t* attr = NULL; + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + float srcZP = 0.0f; + float srcScale = 1.0f; + float dstZP = 0.0f; + float dstScale = 1.0f; // Alignment with a power of two value. gpu_param_t gpu_param = { 2, // workdim @@ -75,14 +81,19 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer) VSI_UNREFERENCED(param_size); - attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); - if (!attr) + attr[0] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]); + attr[1] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]); + if ((!attr[0]) || (!attr[1])) { VSILOGE("Query failure! 
at line"); return status; } - sf_size = (int)attr->shape->data[0]; + sf_size = (int)attr[0]->shape->data[0]; + srcScale = attr[0]->scale; + srcZP = (float)attr[0]->zero_point; + dstScale = 1.0f / attr[1]->scale; + dstZP = (float)attr[1]->zero_point; gpu_param.global_offset[0] = 0; gpu_param.global_offset[1] = 0; @@ -91,7 +102,7 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer) gpu_param.local_size[0] = 1; gpu_param.local_size[1] = 1; gpu_param.global_size[0] = - gpu_align_p2((1 + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], + gpu_align_p2((attr[0]->shape->data[1] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], gpu_param.local_size[0]); gpu_param.global_size[1] = gpu_align_p2((1 + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1], @@ -107,25 +118,50 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer) 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtract8Bin_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; status = vsi_nn_kernel_gpu_add_param( node, "Uni4x4_Fp16ToFp32", &Uni4x4_Fp16ToFp32 ); - vsi_nn_kernel_gpu_add_param(node, + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Bin_2x8", &uniExtract8Bin_2x8 ); + status |= vsi_nn_kernel_gpu_add_param(node, "sf_size", &sf_size); + status |= vsi_nn_kernel_gpu_add_param(node, "srcScale", &srcScale); + status |= vsi_nn_kernel_gpu_add_param(node, "srcZP", &srcZP); + status |= vsi_nn_kernel_gpu_add_param(node, "dstScale", &dstScale); + status |= vsi_nn_kernel_gpu_add_param(node, "dstZP", &dstZP); } - status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + status |= vsi_nn_kernel_gpu_config( node, &gpu_param ); if(status != VSI_SUCCESS) { VSILOGE("Initializer failure!"); } - if (attr) vsi_nn_kernel_tensor_attr_release( &attr ); + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } return status; } -static const vx_kernel_description_t _kernel_info = +static const vx_kernel_description_t _kernel_info1 = { KERNEL_ID_PLACEHOLDER, _KERNEL_NAME, @@ -139,6 +175,20 @@ static const vx_kernel_description_t _kernel_info = vsi_nn_KernelDeinitializer }; +static const vx_kernel_description_t _kernel_info2 = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME_U8, + NULL, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + _softmax_initializer, + vsi_nn_KernelDeinitializer +}; + static vsi_status _query_kernel ( vsi_nn_tensor_t* const* const inputs, @@ -146,9 +196,20 @@ static vsi_status _query_kernel vsi_nn_kernel_t* kernel ) { - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(outputs); - memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + + in_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); + out_dtype = vsi_nn_kernel_map_dtype(outputs[0]->attr.dtype.vx_type); + + if (in_dtype == U8 && out_dtype == U8) + { + memmove( &kernel->info, &_kernel_info2, sizeof(vx_kernel_description_t) ); + } + else + { + memmove( &kernel->info, &_kernel_info1, sizeof(vx_kernel_description_t) ); + } 
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", @@ -173,12 +234,42 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; int32_t axis = 0; + vsi_nn_tensor_t* reshape_tensors[2] = {NULL}; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + uint32_t rank_in = 0; + int32_t new_axis = 0; + uint32_t i = 0; + vsi_bool ret = vx_false_e; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); axis = vsi_nn_kernel_param_get_int32(params, "axis"); + ret = vsi_nn_kernel_optimize_softmax_shape(inputs[0]->attr.size, + inputs[0]->attr.dim_num, + axis, + shapes[0], + &rank_in, + &new_axis); + + if (ret) + { + reshape_tensors[0] = vsi_nn_reshape_tensor(graph, inputs[0], shapes[0], rank_in); + reshape_tensors[1] = vsi_nn_reshape_tensor(graph, outputs[0], shapes[0], rank_in); + } + else + { + return NULL; + } + + if (!vsi_nn_kernel_gpu_check_shape(reshape_tensors[0]->attr.size, + reshape_tensors[0]->attr.dim_num) || + new_axis > 2) + { + return NULL; + } + status = _query_kernel( inputs, outputs, kernel ); if( VSI_SUCCESS == status) { @@ -187,9 +278,9 @@ static vsi_nn_kernel_node_t _setup { /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + reshape_tensors, _CPU_INPUT_NUM, &reshape_tensors[1], _CPU_OUTPUT_NUM ); backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( - graph, I32, &axis ); + graph, I32, &new_axis ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); @@ -200,6 +291,11 @@ static vsi_nn_kernel_node_t _setup status = VSI_FAILURE; } } + + for (i = 0; i < 2; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_letterbox.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_letterbox.c new file mode 100644 index 0000000..6567838 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_letterbox.c @@ -0,0 +1,227 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _custom_letterbox_local_data_t { + int32_t placeholder; +} custom_letterbox_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +int32_t my_round(float in) +{ + if (in >= 0) + { + return (int)(in + 0.5f); + } + else + { + return (int)(in - 0.5f); + } +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_custom_letterbox_param * p; + p = &(self->nn_param.custom_letterbox); + int32_t shape_w = (int32_t)inputs[0]->attr.size[1]; + int32_t shape_h = (int32_t)inputs[0]->attr.size[2]; + int32_t new_shape_w = (int32_t)outputs[0]->attr.size[0]; + int32_t new_shape_h = (int32_t)outputs[0]->attr.size[1]; + vx_bool auto_bool = p->auto_bool; + vx_bool scaleFill = p->scaleFill; + vx_bool scaleup = p->scaleup; + int32_t stride = p->stride; + vx_bool center = p->center; + + float r = 1.0f; + int32_t new_unpad_w = 0; + int32_t new_unpad_h = 0; + int32_t dw = 0; + int32_t dh = 0; + int32_t top = 0; + int32_t bottom = 0; + int32_t left = 0; + int32_t right = 0; + + r = (float)fmin((float)new_shape_w / shape_w, (float)new_shape_h / shape_h); + if (!scaleup) + { + r = (float)fmin(r, 1.0f); + } + + new_unpad_w = my_round(r * shape_w); + new_unpad_h = my_round(r * shape_h); + dw = new_shape_w - new_unpad_w; + dh = new_shape_h - new_unpad_h; + if (auto_bool) + { + dw = dw % stride; + dh = dh % stride; + } + else if (scaleFill) + { + dw = 0; + dh = 0; + new_unpad_w = new_shape_w; + new_unpad_h = new_shape_h; + } + if (center) + { + top = my_round(dh / 2.0f - 0.1f); + bottom = my_round(dh / 2.0f + 0.1f); + left = my_round(dw / 2.0f - 0.1f); + right = my_round(dw / 2.0f + 0.1f); + } + else + { + top = 0; + bottom = my_round(dh + 0.1f); + left = 0; + right = my_round(dw + 0.1f); + } + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "top", top); + vsi_nn_kernel_param_add_int32( param, "bottom", bottom); + vsi_nn_kernel_param_add_int32( param, "left", left); + vsi_nn_kernel_param_add_int32( param, "right", right); + vsi_nn_kernel_param_add_float32( param, "mean_r", p->mean_r); + vsi_nn_kernel_param_add_float32( param, "mean_g", p->mean_g); + vsi_nn_kernel_param_add_float32( param, "mean_b", p->mean_b); + vsi_nn_kernel_param_add_float32( param, "scale_r", p->scale_r); + vsi_nn_kernel_param_add_float32( param, "scale_g", p->scale_g); + vsi_nn_kernel_param_add_float32( param, "scale_b", p->scale_b); + vsi_nn_kernel_param_add_int32( param, "pad_value_r", p->pad_value_r); + vsi_nn_kernel_param_add_int32( param, "pad_value_g", p->pad_value_g); + vsi_nn_kernel_param_add_int32( param, "pad_value_b", p->pad_value_b); + vsi_nn_kernel_param_add_int32( param, "reverse_channel", p->reverse_channel); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "custom_letterbox", + inputs, 1, + outputs, 1, param ); + + vsi_nn_kernel_param_release( ¶m ); + + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + 
vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(LETTERBOX, 1, 1) + IO_TYPE(D_U8, D_F16) + IO_TYPE(D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_I8|Q_ASYM) + IO_TYPE(D_U8, D_I8|Q_SYM) + END_IO_TYPE_DECL(LETTERBOX) + if (!VALIDATE_OP_IO_TYPES(LETTERBOX, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = self->nn_param.custom_letterbox.new_shape_w; + outputs[0]->attr.size[1] = self->nn_param.custom_letterbox.new_shape_h; + outputs[0]->attr.size[2] = 3; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUSTOM_LETTERBOX, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c index 50c435b..bcde042 100644 --- a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c @@ -85,18 +85,24 @@ static const struct { HASH_CUMSUM_KERNELS(0, U8, U8) HASH_CUMSUM_KERNELS(0, F32, F32) HASH_CUMSUM_KERNELS(0, F32, U8) + HASH_CUMSUM_KERNELS(0, I32, I32) HASH_CUMSUM_KERNELS(1, U8, U8) HASH_CUMSUM_KERNELS(1, F32, F32) HASH_CUMSUM_KERNELS(1, F32, U8) + HASH_CUMSUM_KERNELS(1, I32, I32) HASH_CUMSUM_KERNELS(2, U8, U8) HASH_CUMSUM_KERNELS(2, F32, F32) HASH_CUMSUM_KERNELS(2, F32, U8) + HASH_CUMSUM_KERNELS(2, I32, I32) + HASH_CUMSUM_KERNELS_2D(0, U8, U8) HASH_CUMSUM_KERNELS_2D(0, F32, F32) HASH_CUMSUM_KERNELS_2D(0, F32, U8) + HASH_CUMSUM_KERNELS_2D(0, I32, I32) HASH_CUMSUM_KERNELS_2D(1, U8, U8) HASH_CUMSUM_KERNELS_2D(1, F32, F32) HASH_CUMSUM_KERNELS_2D(1, F32, U8) + HASH_CUMSUM_KERNELS_2D(1, I32, I32) HASH_CUMSUM_ARRAY_KERNELS(0, U8, U8, KERNEL_SOURCE_3) HASH_CUMSUM_ARRAY_KERNELS(0, F32, F32, KERNEL_SOURCE_3) diff --git a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c index f139ccb..44c14d6 100644 --- a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c @@ -26,6 +26,7 @@ #include #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" @@ -644,7 +645,8 @@ static vsi_nn_kernel_node_t _setup #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT shader_cnt_support = - (graph->ctx->config.subGroupSize >= 64 && graph->ctx->config.use_40bits_va) ? TRUE : FALSE; + (((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize >= 64 && + ((vsi_nn_graph_prv_t*)graph)->options->config.use_40bits_va) ? 
TRUE : FALSE; #endif if ((in1_h % 64 == 0) && (transFlg == 1) && (out_h % 8 == 0) && shader_cnt_support) { diff --git a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c index eb0e556..60ab16b 100644 --- a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c @@ -75,6 +75,7 @@ static const _kernel_map_type _one_hot_kernel_map[] = PACK_ONE_HOT_KERNEL_MAP( F32, F32 ), PACK_ONE_HOT_KERNEL_MAP( I32, I32 ), PACK_ONE_HOT_KERNEL_MAP( I32, F32 ), + PACK_ONE_HOT_KERNEL_MAP( I32, BF16 ), PACK_ONE_HOT_KERNEL_MAP( I32, U8 ), PACK_ONE_HOT_KERNEL_MAP( U8, U8 ), }; diff --git a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c index 87c8593..1f0d4a9 100644 --- a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c @@ -79,7 +79,7 @@ static const struct { const char* source_name; } kernel_map[] = { - PRELU_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1) + PRELU_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1) PRELU_KERNELS_FLOAT(F16, F16, F16, KERNEL_SOURCE_1) PRELU_KERNELS(U8, U8, U8, KERNEL_SOURCE_1) PRELU_KERNELS(I32, I32, I32, KERNEL_SOURCE_1) diff --git a/src/tim/vx/internal/src/kernel/cl/rope_cl.c b/src/tim/vx/internal/src/kernel/cl/rope_cl.c new file mode 100644 index 0000000..90c60c3 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/rope_cl.c @@ -0,0 +1,329 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum +{ + INTERNAL_KERNEL_ROPE, +} _internal_kernel_e; + +#define _ROPE_KERNEL_SOURCE "rope" +#define _ROPE_KERNEL_NAME CVIVANTE_NAMESPACE("cl.rope") + +// Add kernel hashtable here +#define STR(a) #a +#define ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ) \ + ((IN0_DTYPE) | (IN0_DTYPE << 8) | (OUT_DTYPE << 16) | (AXIS << 25)) +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ) \ + { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ), \ + CVIVANTE_NAMESPACE("cl.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_axis"STR(AXIS)), \ + "rope_0" } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _rope_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32, F32, 0 ), + PACK_KERNEL_MAP( F32, F32, F32, 1 ), + PACK_KERNEL_MAP( F32, F32, F32, 2 ), + PACK_KERNEL_MAP( I32, I32, I32, 0 ), + PACK_KERNEL_MAP( I32, I32, I32, 1 ), + PACK_KERNEL_MAP( I32, I32, I32, 2 ), + PACK_KERNEL_MAP( U32, U32, U32, 0 ), + PACK_KERNEL_MAP( U32, U32, U32, 1 ), + PACK_KERNEL_MAP( U32, U32, U32, 2 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _rope_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _ROPE_PARAM_NUM _cnt_of_array( _rope_kernel_param_def ) +#define SCALAR_AXIS (4) +#define SCALAR_IN_ZP (5) +#define SCALAR_COS_ZP (6) +#define SCALAR_SIN_ZP (7) +#define SCALAR_SCALE0 (8) +#define SCALAR_SCALE1 (9) +#define SCALAR_OUT_ZP (10) +#define SCALAR_HALF_HEAD_SIZE (11) +#define SCALAR_STEP (12) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_rope_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t* attr[2] = { NULL }; + int32_t axis = 0; + vsi_size_array_t* out_shape = NULL; + vsi_size_t shape[3] = { 1 }; + + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis); + CHECK_STATUS_FAIL_GOTO(status, final); + + out_shape = attr[1]->shape; + 
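+    /* One work item along the rotation axis covers a pair of output elements,
+     * so the global work size on that axis is halved below. */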
shape[0] = out_shape->data[0]; + shape[1] = out_shape->data[1]; + shape[2] = out_shape->data[2]; + shape[axis] = shape[axis] / 2; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = shape[0]; + gpu_param.global_size[1] = shape[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? shape[2] : 1; + + status = vsi_nn_kernel_gpu_config(node, &gpu_param); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _rope_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e in2_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _rope_kernel_map; + size_t kernel_map_size = _cnt_of_array( _rope_kernel_map ); + vx_param_description_t * param_def = _rope_kernel_param_def; + vx_kernel_initialize_f initializer = _rope_initializer; + + uint32_t key = 0; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type); + in2_dtype = vsi_nn_kernel_map_dtype(inputs[2]->attr.dtype.vx_type); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + +#define _PACK_SELECT_KEY( in0_type, in1_type, in2_type, out_type ) \ + ((in0_type) | (in1_type << 8) | (in2_type << 16) | (out_type << 24)) + switch (_PACK_SELECT_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype)) + { + case _PACK_SELECT_KEY(F32, F32, F32, F32): + case _PACK_SELECT_KEY(F16, F16, F16, F16): + key = ROPE_HASH_KEY(F32, F32, F32, axis); + break; + case _PACK_SELECT_KEY(U8, U8, U8, U8): + case _PACK_SELECT_KEY(U16, U16, U16, U16): + key = ROPE_HASH_KEY(U32, U32, U32, axis); + break; + case _PACK_SELECT_KEY(I8, I8, I8, I8): + case _PACK_SELECT_KEY(I16, I16, I16, I16): + case _PACK_SELECT_KEY(I32, I32, I32, I32): + key = ROPE_HASH_KEY(I32, I32, I32, axis); + break; + default: + break; + } +#undef _PACK_SELECT_KEY + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _rope_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ROPE_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = vsi_nn_kernel_param_get_int32(params, 
"axis"); + int32_t interleaved = vsi_nn_kernel_param_get_int32(params, "interleaved"); + float in_scale = vsi_nn_get_tensor_scale(inputs[0]); + float cos_scale = vsi_nn_get_tensor_scale(inputs[1]); + float sin_scale = vsi_nn_get_tensor_scale(inputs[2]); + float out_scale = vsi_nn_get_tensor_scale(outputs[0]); + float in_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float cos_zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]); + float sin_zp = (float)vsi_nn_get_tensor_zero_point(inputs[2]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + int32_t half_head_size = interleaved ? 1 : (int32_t)(inputs[0]->attr.size[axis] / 2); + float scale0 = in_scale * cos_scale / out_scale; + float scale1 = in_scale * sin_scale / out_scale; + int32_t step = interleaved ? 2 : 1; + int32_t i = 0; + + // Check if gpu can support the size + if ( !vsi_nn_kernel_gpu_check_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, axis ); + if (VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ROPE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + node_params[SCALAR_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis); + node_params[SCALAR_IN_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &in_zp); + node_params[SCALAR_COS_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &cos_zp); + node_params[SCALAR_SIN_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &sin_zp); + node_params[SCALAR_SCALE0] = vsi_nn_kernel_scalar_create( + graph, F32, &scale0); + node_params[SCALAR_SCALE1] = vsi_nn_kernel_scalar_create( + graph, F32, &scale1); + node_params[SCALAR_OUT_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &output_zp); + node_params[SCALAR_HALF_HEAD_SIZE] = vsi_nn_kernel_scalar_create( + graph, I32, &half_head_size); + node_params[SCALAR_STEP] = vsi_nn_kernel_scalar_create( + graph, I32, &step); + status = vsi_nn_kernel_node_pass_param( node, node_params, _ROPE_PARAM_NUM ); + } + } + + for (i = SCALAR_AXIS; i < (int32_t)_ROPE_PARAM_NUM; i++) + { + if (node_params[i]) + { + vsi_nn_kernel_scalar_release(&node_params[i]); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( rope, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/swish_cl.c b/src/tim/vx/internal/src/kernel/cl/swish_cl.c index 97d0db9..97cbd4a 100644 --- a/src/tim/vx/internal/src/kernel/cl/swish_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/swish_cl.c @@ -27,6 +27,7 @@ #include #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" @@ -299,7 +300,7 @@ static vsi_nn_kernel_node_t _setup VSI_UNREFERENCED(output_num); #if (VX_ACTIVATION_EXT_SUPPORT) - if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) + if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver) { return NULL; } diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c index 78b9a9b..3a698fe 100644 --- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c @@ -26,6 +26,7 @@ #include #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" @@ -457,7 +458,7 @@ static vsi_nn_kernel_node_t _setup vsi_bool 
is_odd_even_sort = FALSE; vsi_bool is_bitnoic_segment = FALSE; size_t param_num = _TOPK_PARAM_NUM; - int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2); + int32_t max_stages = 7 + (int32_t)log2(((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize >> 2); vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -483,6 +484,11 @@ static vsi_nn_kernel_node_t _setup return NULL; } + if (block_size >= GPU_TENSOR_MAX_WIDTH) + { + return NULL; + } + shape[0][0] = block_size; shape[0][1] = block_num; shape[1][0] = top_k; diff --git a/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c b/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c index 75623dd..95435d2 100644 --- a/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c @@ -27,6 +27,7 @@ #include #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" @@ -192,7 +193,7 @@ static vsi_bool _bucketize_support_types return FALSE; } - if (in_dtype == F16 && graph->ctx->config.evis.ver != VSI_NN_HW_EVIS_2) + if (in_dtype == F16 && ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver != VSI_NN_HW_EVIS_2) { return FALSE; } diff --git a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c index 0e4e1fe..ee8c8fe 100644 --- a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c @@ -27,6 +27,7 @@ #include #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" @@ -771,7 +772,8 @@ static vsi_nn_kernel_node_t _setup temp_tensor[1] = weights; temp_tensor[2] = biases; - ks = get_kernel_size(weights->attr.size[0], dilation, stride, graph->ctx->config.evis.ver); + ks = get_kernel_size(weights->attr.size[0], dilation, stride, + ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver); status = _query_kernel( kernel, temp_tensor, outputs, dilation, ks); diff --git a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c index ce13b84..79e5b02 100644 --- a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c @@ -121,7 +121,9 @@ static const _kernel_map_type _groupnorm_sums_kernel_map[] = TENSOR_GROUPNORM_SUMS_KERNELS( U8, F32, KERNEL_SOURCE_0 ) TENSOR_GROUPNORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_0 ) TENSOR_GROUPNORM_SUMS_KERNELS( I16, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SUMS_KERNELS( U16, F32, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SUMS_KERNELS_2D( I16, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SUMS_KERNELS_2D( U16, F32, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SUMS_KERNELS( F16, F32, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SUMS_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 ) }; @@ -174,6 +176,9 @@ static const _kernel_map_type _groupnorm_kernel_map[] = TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, U8, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, F16, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, F16, KERNEL_SOURCE_2 ) + + TENSOR_GROUPNORM_SCALE_KERNELS( U16, F32, U16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( U16, F32, U16, KERNEL_SOURCE_2 ) }; /* @@ -245,6 
+250,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer) float sum_x2_tail0 = 1; float sum_x2_tail1 = 1; float work_item_pixels = 1; + vsi_bool is_input_8bits = FALSE; VSI_UNREFERENCED(param_size); @@ -263,12 +269,13 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer) width = (int32_t)(input_shape->data[0]); height = (int32_t)(input_shape->data[1]); chn = (int32_t)(attr[1]->shape->data[1]); + is_input_8bits = attr[0]->dtype == I8 || attr[0]->dtype == U8; if (is2D) { height = 1; } - work_item_pixels = (float)height * 16; + work_item_pixels = is_input_8bits ? 16 * (float)height : 8 * (float)height; sum_x_tail = -work_item_pixels * input_zp * input_scale; sum_x2_tail0 = work_item_pixels * input_zp * input_zp * input_scale2; @@ -281,11 +288,11 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer) shaderParam.local_size[1] = 1; shaderParam.local_size[2] = 1; - if (attr[0]->dtype == I8 || attr[0]->dtype == U8) + if (is_input_8bits) { shaderParam.global_size[0] = (width + 255) / 256 * 16; } - else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16) { shaderParam.global_size[0] = (width + 127) / 128 * 16; } @@ -324,7 +331,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1); CHECK_STATUS_FAIL_GOTO(status, OnError ); } - else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16) { gpu_dp_inst_t uniSum_X_X2_8x2 = {{ 0x55555555, // TCfg @@ -483,7 +490,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) } shaderParam.global_scale[0] = 16; - if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16) { shaderParam.global_scale[0] = 8; } @@ -610,6 +617,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( U16, U16 ): case _PACK_SELECT_KEY( I16, I16 ): case _PACK_SELECT_KEY( I16, F16 ): case _PACK_SELECT_KEY( F16, F16 ): @@ -838,8 +846,7 @@ static vsi_nn_kernel_node_t _setup attr.is_const = FALSE; attr.vtl = TRUE; attr.size[0] = ((new_shape[0] + 255) / 256) * 4; - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 - || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) + if (in0_dtype == I16 || in0_dtype == F16 || in0_dtype == U16) { attr.size[0] = ((new_shape[0] + 127) / 128) * 4; } diff --git a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c index 4703424..5d7ae5a 100644 --- a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c @@ -124,22 +124,23 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) {0, 0, 0} }; int8_t in0_fl = 0; - int32_t inputZP0 = 0; - float input_scale0 = 1.0f; - int32_t inputZP1 = 0; - float input_scale1 = 1.0f; + int32_t input0_zp = 0; + float input0_scale = 1.0f; + int32_t input1_zp = 0; + float input1_scale = 1.0f; + float output_zp = 0; int8_t out_fl = 0; - float outputZP = 0; - int32_t shift0 = 0; - vsi_bool is_ge_fl = FALSE; + int32_t shift0 = 0; + vsi_bool is_ge_fl = FALSE; + vsi_bool is_2d_img = FALSE; uint32_t evis_version = 0; vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; vsi_size_array_t * out_shape = NULL; uint32_t pack_key; - vx_context ctx = vxGetContext((vx_reference)node); + vx_context ctx = vxGetContext((vx_reference)node); vx_hardware_caps_params_t hw_param; 
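+    /* evis_version (derived from hw_param) is folded into the pack key so the
+       EVIS1 and EVIS2 DP-instruction variants can be selected separately. */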
VSI_UNREFERENCED(param_size); @@ -165,34 +166,30 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); out_shape = attr[2]->shape; - inputZP0 = attr[0]->zero_point; - input_scale0 = attr[0]->scale; - inputZP1 = attr[1]->zero_point; - input_scale1 = attr[1]->scale; - outputZP = (float)attr[2]->zero_point; - input_scale0 = input_scale0 / attr[2]->scale; + input0_zp = attr[0]->zero_point; + input0_scale = attr[0]->scale; + input1_zp = attr[1]->zero_point; + input1_scale = attr[1]->scale; + output_zp = (float)attr[2]->zero_point; + input0_scale = input0_scale / attr[2]->scale; - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP && + attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) { in0_fl = (int8_t)attr[0]->dfp.fl; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { out_fl = (int8_t)attr[2]->dfp.fl; + shift0 = in0_fl - out_fl; + is_ge_fl = shift0 >= 0; } - shift0 = in0_fl - out_fl; - is_2d_img = (out_shape->size < 3) || (out_shape->data[2] == 1); - is_ge_fl = shift0 >= 0; #define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, GE_FL, IMG_2D, EVIS2 ) \ (IN0_TYPE | ( OUT_TYPE << 16) | (GE_FL << 24) | (IMG_2D << 25) | (EVIS2 << 26)) - pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype, is_ge_fl, is_2d_img, evis_version ); + pack_key = _PACK_SELECT_KEY(attr[0]->dtype, attr[2]->dtype, is_ge_fl, is_2d_img, evis_version); - if ( attr[0]->dtype == I8 && attr[2]->dtype == I8 && is_ge_fl) + if (attr[0]->dtype == I8 && attr[2]->dtype == I8 && is_ge_fl) { gpu_param.global_scale[0] = 16; gpu_param.global_scale[1] = 1; @@ -204,7 +201,6 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; } - gpu_param.global_size[0] = gpu_align_p2( (out_shape->data[0] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); @@ -215,97 +211,97 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) switch( pack_key ) { - case _PACK_SELECT_KEY( I8, I8, 1, 1, 2 ): - case _PACK_SELECT_KEY( I16, I16, 1, 1, 2 ): + case _PACK_SELECT_KEY(I8, I8, 1, 1, 2): + case _PACK_SELECT_KEY(I16, I16, 1, 1, 2): + { + gpu_dp_inst_t uniPreluDFPLo_2x8b = { { + 0x77777777, // TCfg + 0x44444444, // ASelt + 0x33221100, 0x77665544, // ABin + 0x00000000, // BSelt + 0x30201000, 0x70605040, // BBin + 0x00004000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPreluDFPHi_2x8b = { { + 0x77777777, // TCfg + 0x44444444, // ASelt + 0xbbaa9988, 0xffeeddcc, // ABin + 0x00000000, // BSelt + 0x30201000, 0x70605040, // BBin + 0x00004000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if (attr[0]->dtype == I16) { - gpu_dp_inst_t uniPreluDFPLo_2x8b = {{ - 0x77777777, // TCfg - 0x44444444, // ASelt - 0x33221100, 0x77665544, // ABin - 0x00000000, // BSelt - 0x30201000, 0x70605040, // BBin - 0x00004000, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPreluDFPHi_2x8b = {{ - 0x77777777, // TCfg - 0x44444444, // ASelt - 0xbbaa9988, 0xffeeddcc, // ABin - 0x00000000, // BSelt - 0x30201000, 0x70605040, // BBin - 0x00004000, // AccumType, ConstantType, and PostShift - 0x00000000, 
0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - - if ( attr[0]->dtype == I16 ) - { - uniPreluDFPLo_2x8b.data[7] = 0x00003000; - uniPreluDFPHi_2x8b.data[7] = 0x00003000; - } - - gpu_dp_inst_update_postshfit( &uniPreluDFPLo_2x8b, shift0 ); - gpu_dp_inst_update_postshfit( &uniPreluDFPHi_2x8b, shift0 ); - - status = vsi_nn_kernel_gpu_add_param( node, - "uniPreluDFPLo_2x8b", &uniPreluDFPLo_2x8b ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniPreluDFPHi_2x8b", &uniPreluDFPHi_2x8b ); - CHECK_STATUS_FAIL_GOTO(status, final ); + uniPreluDFPLo_2x8b.data[7] = 0x00003000; + uniPreluDFPHi_2x8b.data[7] = 0x00003000; } - break; - case _PACK_SELECT_KEY( I8, I8, 1, 1, 1 ): - case _PACK_SELECT_KEY( I16, I16, 1, 1, 1 ): - { - gpu_dp_inst_t uniPreluInt8_2x8 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0xb3a29180, 0xf7e6d5c4, // ABin - 0x66666666, // BSelt - 0x30201000, 0x70605040, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPreluInt16_part0_4x4 = {{ - 0x05050505, // TCfg - 0x00000000, // ASelt - 0x00510040, 0x00730062, // ABin - 0x06060606, // BSelt - 0x00100000, 0x00300020, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, - 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniPreluInt16_part1_4x4 = {{ - 0x05050505, // TCfg - 0x00000000, // ASelt - 0x00510040, 0x00730062, // ABin - 0x06060606, // BSelt - 0x00500040, 0x00700060, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, - 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_update_postshfit( &uniPreluInt8_2x8, shift0 ); - gpu_dp_inst_update_postshfit( &uniPreluInt16_part0_4x4, shift0 ); - gpu_dp_inst_update_postshfit( &uniPreluInt16_part1_4x4, shift0 ); + gpu_dp_inst_update_postshfit(&uniPreluDFPLo_2x8b, shift0); + gpu_dp_inst_update_postshfit(&uniPreluDFPHi_2x8b, shift0); - status = vsi_nn_kernel_gpu_add_param( node, - "uniPreluInt8_2x8", &uniPreluInt8_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniPreluInt16_part0_4x4", &uniPreluInt16_part0_4x4 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniPreluInt16_part1_4x4", &uniPreluInt16_part1_4x4 ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - break; - case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 1 ): - case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 2 ): - case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 1 ): - case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 2 ): + status = vsi_nn_kernel_gpu_add_param(node, + "uniPreluDFPLo_2x8b", &uniPreluDFPLo_2x8b); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniPreluDFPHi_2x8b", &uniPreluDFPHi_2x8b); + CHECK_STATUS_FAIL_GOTO(status, final); + } + break; + case _PACK_SELECT_KEY(I8, I8, 1, 1, 1): + case _PACK_SELECT_KEY(I16, I16, 1, 1, 1): + { + gpu_dp_inst_t uniPreluInt8_2x8 = { { + 0x55555555, // TCfg + 0x00000000, // ASelt + 0xb3a29180, 0xf7e6d5c4, // ABin + 0x66666666, // BSelt + 0x30201000, 0x70605040, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPreluInt16_part0_4x4 = { { + 0x05050505, // TCfg + 0x00000000, // ASelt + 
0x00510040, 0x00730062, // ABin + 0x06060606, // BSelt + 0x00100000, 0x00300020, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPreluInt16_part1_4x4 = { { + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00510040, 0x00730062, // ABin + 0x06060606, // BSelt + 0x00500040, 0x00700060, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit(&uniPreluInt8_2x8, shift0); + gpu_dp_inst_update_postshfit(&uniPreluInt16_part0_4x4, shift0); + gpu_dp_inst_update_postshfit(&uniPreluInt16_part1_4x4, shift0); + + status = vsi_nn_kernel_gpu_add_param(node, + "uniPreluInt8_2x8", &uniPreluInt8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniPreluInt16_part0_4x4", &uniPreluInt16_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniPreluInt16_part1_4x4", &uniPreluInt16_part1_4x4); + CHECK_STATUS_FAIL_GOTO(status, final); + } + break; + case _PACK_SELECT_KEY(BF16, BF16, 0, 1, 1): + case _PACK_SELECT_KEY(BF16, BF16, 0, 1, 2): + case _PACK_SELECT_KEY(BF16, BF16, 0, 0, 1): + case _PACK_SELECT_KEY(BF16, BF16, 0, 0, 2): { gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ 0x11111111, // TCfg @@ -446,15 +442,15 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "uniConvF16toF32_part1_4x4", &uniConvF16toF32_part1_4x4 ); status |= vsi_nn_kernel_gpu_add_param( node, - "inputZP0", &inputZP0 ); + "input0_zp", &input0_zp); status |= vsi_nn_kernel_gpu_add_param( node, - "input_scale0", &input_scale0 ); + "input0_scale", &input0_scale ); status |= vsi_nn_kernel_gpu_add_param( node, - "inputZP1", &inputZP1 ); + "input1_zp", &input1_zp); status |= vsi_nn_kernel_gpu_add_param( node, - "input_scale1", &input_scale1 ); + "input1_scale", &input1_scale ); status |= vsi_nn_kernel_gpu_add_param( node, - "outputZP", &outputZP ); + "output_zp", &output_zp ); if (attr[2]->dtype == F16) { status |= vsi_nn_kernel_gpu_add_param( node, diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index a63fc3a..63c72b2 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -27,6 +27,7 @@ #include #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" @@ -58,53 +59,92 @@ typedef enum #define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt" #define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_1" #define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_2" +#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_3" +#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC4(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_4" +#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC5(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_5" #define STR(a) #a // Add kernel hashtable here -#define RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, scale_flag ) \ - (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (scale_flag)) 
+#define RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, scale_flag, same_type ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8) | (scale_flag << 16) | (same_type << 22)) -#define PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE ) \ - { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, DOWN ), \ +#define _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, DOWN, SAME_TYPE ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_DOWN"), \ _RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) } -#define PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE ) \ - { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP ), \ +#define PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE ) \ + _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, TRUE ), \ + _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, FALSE ) + +#define _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP, SAME_TYPE ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP"), \ _RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) } -#define PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE ) \ - { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_OPT ), \ +#define PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE ) \ + _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, TRUE ), \ + _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, FALSE ) + +#define _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_OPT, SAME_TYPE ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP_opt"), \ _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(IN_DTYPE) } -#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \ - { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF ), \ +#define PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE ) \ + _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, TRUE ), \ + _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, FALSE ) + +#define PACK_KERNEL_MAP_UP_2X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF, TRUE ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ "_SAME_2x_upsample_half_pixel_centers"), \ _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } -#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \ - { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF ), \ +#define PACK_KERNEL_MAP_UP_4X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF, TRUE ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ "_SAME_4x_upsample_half_pixel_centers"), \ _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } -#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \ - { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF ), \ +#define PACK_KERNEL_MAP_UP_8X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF, TRUE ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ "_SAME_8x_upsample_half_pixel_centers"), \ _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(IN_DTYPE) } -#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \ - { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF ), \ +#define PACK_KERNEL_MAP_UP_3X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF, TRUE ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ "_SAME_3x_upsample_half_pixel_centers"), \ 
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } +#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF, FALSE ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_2x_upsample_half_pixel_centers"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF, FALSE ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_4x_upsample_half_pixel_centers"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF, FALSE ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_8x_upsample_half_pixel_centers"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC5(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF, FALSE ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_3x_upsample_half_pixel_centers"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC4(IN_DTYPE) } + #define PACK_KERNEL_MAP_UP_8X_ALIGN( IN_DTYPE, OUT_DTYPE ) \ - { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN ), \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN, TRUE ), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ "_SAME_8x_upsample_align_corners"), \ "resize_bilinear_align_corners" } @@ -135,6 +175,10 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] = PACK_KERNEL_MAP_UP(F16, F16), PACK_KERNEL_MAP_UP(BF16, BF16), PACK_KERNEL_MAP_UP_OPT(U8, U8), + PACK_KERNEL_MAP_UP_2X_HALF_SAME_TYPE(U8, U8), + PACK_KERNEL_MAP_UP_3X_HALF_SAME_TYPE(U8, U8), + PACK_KERNEL_MAP_UP_4X_HALF_SAME_TYPE(U8, U8), + PACK_KERNEL_MAP_UP_8X_HALF_SAME_TYPE(U8, U8), PACK_KERNEL_MAP_UP_2X_HALF(U8, U8), PACK_KERNEL_MAP_UP_3X_HALF(U8, U8), PACK_KERNEL_MAP_UP_4X_HALF(U8, U8), @@ -672,18 +716,23 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer) }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_nn_kernel_tensor_attr_t * input_attr = NULL; - vsi_size_array_t * out_shape = NULL; - vsi_size_array_t * in_shape = NULL; + vsi_size_array_t * out_shape = NULL; + vsi_size_array_t * in_shape = NULL; vsi_nn_kernel_dtype_e input_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; uint32_t depth = 0; uint32_t in_width = 0; uint32_t in_height = 0; uint32_t out_width = 0; uint32_t out_height = 0; + vsi_bool is_same_type = FALSE; vsi_bool is_2x_up_kernel = FALSE; vsi_bool is_3x_up_kernel = FALSE; vsi_bool is_4x_up_kernel = FALSE; vsi_bool is_8x_up_kernel = FALSE; + float scale = 1.f; + int32_t input_zp = 0; + int32_t output_zp = 0; VSI_UNREFERENCED(param_size); @@ -692,17 +741,23 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer) output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); - out_shape = output_attr->shape; - in_shape = input_attr->shape; - input_dtype = input_attr->dtype; + out_shape = output_attr->shape; + in_shape = input_attr->shape; + input_dtype = input_attr->dtype; + output_dtype = output_attr->dtype; in_width = (uint32_t)(in_shape->data[0]); in_height = (uint32_t)(in_shape->data[1]); depth = (uint32_t)(in_shape->data[2]); out_width 
= (uint32_t)(out_shape->data[0]); out_height = (uint32_t)(out_shape->data[1]); + scale = input_attr->scale; + input_zp = input_attr->zero_point; + scale /= output_attr->scale; + output_zp = output_attr->zero_point; + is_same_type = _is_same_quant(input_attr, output_attr); - if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr))) + if ((U8 == input_dtype) && (output_dtype == U8)) { is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height); is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height); @@ -728,206 +783,303 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer) gpu_param.global_scale[2] = 1; } - if (is_2x_up_kernel) + if (is_2x_up_kernel || is_3x_up_kernel || is_4x_up_kernel || is_8x_up_kernel) { - gpu_dp_inst_t uniResize2xUp_0_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect - 0x00000704, // AccumType, ConstantType, and PostShift - 0x09030301, 0x03090103, 0x09030301, 0x03090103, - 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize2xUp_1_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect - 0x00000704, // AccumType, ConstantType, and PostShift - 0x09030301, 0x03090103, 0x09030301, 0x03090103, - 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant - }, GPU_DP_TYPE_16}; + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP[2] = { 0 }; + gpu_dp_inst_t uniU8PostProcess_2x8 = { { + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (is_3x_up_kernel) - { - gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{ - 0x15515515, // TCfg - 0x00000000, // ASelt - 0x21210110, 0x03323202, // ABin - 0x2aa2aa2a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000610, // AccumType, ConstantType, and PostShift - 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555, - 0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{ - 0x05155155, // TCfg - 0x00000000, // ASelt - 0x54044343, 0x00650554, // ABin - 0x0a2aa2aa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000610, // AccumType, ConstantType, and PostShift - 0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa, - 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{ - 0x55551155, // TCfg - 0x50501050, // ASelt - 0x01011010, 0x21212121, // ABin - 0xaaaa22aa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x0000060f, // AccumType, ConstantType, and PostShift - 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, - 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{ - 0x11555511, // TCfg - 0x10505010, // ASelt - 0x32320202, 0x03033232, // ABin - 0x22aaaa22, // BSelt - 0x00000000, 0x00000000, // BBin - 0x0000060f, // 
AccumType, ConstantType, and PostShift - 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72, - 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{ - 0x55115555, // TCfg - 0x50105050, // ASelt - 0x43434343, 0x54540404, // ABin - 0xaa22aaaa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x0000060f, // AccumType, ConstantType, and PostShift - 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39, - 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{ - 0x00551155, // TCfg - 0x00501050, // ASelt - 0x05055454, 0x00006565, // ABin - 0x00aa22aa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x0000060f, // AccumType, ConstantType, and PostShift - 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, - 0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; + if (is_2x_up_kernel) + { + gpu_dp_inst_t uniResize2xUp_0_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect + 0x00000704, // AccumType, ConstantType, and PostShift + 0x09030301, 0x03090103, 0x09030301, 0x03090103, + 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize2xUp_1_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect + 0x00000704, // AccumType, ConstantType, and PostShift + 0x09030301, 0x03090103, 0x09030301, 0x03090103, + 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant + }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (is_4x_up_kernel) - { - gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect - 0x00000406, // AccumType, ConstantType, and PostShift - 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, - 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect - 0x00000406, // AccumType, ConstantType, and PostShift - 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, - 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect - 0x00000406, // AccumType, ConstantType, and PostShift - 0x23150503, 0x31070701, 0x07310107, 0x15230305, - 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect - 0x00000406, // AccumType, ConstantType, and PostShift - 0x23150503, 0x31070701, 0x07310107, 
0x15230305, - 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant - }, GPU_DP_TYPE_16}; + if (!is_same_type) + { + float f2i_radio = 16.0f; + gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift); + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio); - status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (is_8x_up_kernel) - { - gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, - 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, - 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, - 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, - 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, - 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, - 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, - 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, - 0x0fe1010f, 
0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant - }, GPU_DP_TYPE_16}; + gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift); + uniResize2xUp_0_4x8.data[7] = 0x00000700; + uniResize2xUp_1_4x8.data[7] = 0x00000700; - status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); - CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8", + &uniU8PostProcess_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP); + CHECK_STATUS_FAIL_GOTO(status, final); + } + + status = vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final); + } + else if (is_3x_up_kernel) + { + gpu_dp_inst_t uniResize3xUp_l00_2x8 = { { + 0x15515515, // TCfg + 0x00000000, // ASelt + 0x21210110, 0x03323202, // ABin + 0x2aa2aa2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555, + 0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize3xUp_l01_2x8 = { { + 0x05155155, // TCfg + 0x00000000, // ASelt + 0x54044343, 0x00650554, // ABin + 0x0a2aa2aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa, + 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize3xUp_l10_4x4 = { { + 0x55551155, // TCfg + 0x50501050, // ASelt + 0x01011010, 0x21212121, // ABin + 0xaaaa22aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, + 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize3xUp_l11_4x4 = { { + 0x11555511, // TCfg + 0x10505010, // ASelt + 0x32320202, 0x03033232, // ABin + 0x22aaaa22, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72, + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize3xUp_l12_4x4 = { { + 0x55115555, // TCfg + 0x50105050, // ASelt + 0x43434343, 0x54540404, // ABin + 0xaa22aaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39, + 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t 
uniResize3xUp_l13_4x4 = { { + 0x00551155, // TCfg + 0x00501050, // ASelt + 0x05055454, 0x00006565, // ABin + 0x00aa22aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, + 0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if (!is_same_type) + { + float f2i_radio = 256.0f; + gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift); + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio); + + gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift); + uniResize3xUp_l00_2x8.data[7] = 0x00000608; + uniResize3xUp_l01_2x8.data[7] = 0x00000608; + uniResize3xUp_l10_4x4.data[7] = 0x00000607; + uniResize3xUp_l11_4x4.data[7] = 0x00000607; + uniResize3xUp_l12_4x4.data[7] = 0x00000607; + uniResize3xUp_l13_4x4.data[7] = 0x00000607; + + status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8", + &uniU8PostProcess_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP); + CHECK_STATUS_FAIL_GOTO(status, final); + } + + status = vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4); + CHECK_STATUS_FAIL_GOTO(status, final); + } + else if (is_4x_up_kernel) + { + gpu_dp_inst_t uniResize4xUp_l00_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize4xUp_l01_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize4xUp_l10_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x23150503, 0x31070701, 0x07310107, 0x15230305, + 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize4xUp_l11_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x23150503, 0x31070701, 0x07310107, 0x15230305, + 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant + }, GPU_DP_TYPE_16 }; + + if (!is_same_type) + { + float f2i_radio = 64.0f; + gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift); + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio); + + gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift); + uniResize4xUp_l00_4x8.data[7] = 
0x00000400; + uniResize4xUp_l01_4x8.data[7] = 0x00000400; + uniResize4xUp_l10_4x8.data[7] = 0x00000400; + uniResize4xUp_l11_4x8.data[7] = 0x00000400; + + status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8", + &uniU8PostProcess_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP); + CHECK_STATUS_FAIL_GOTO(status, final); + } + + status = vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final); + } + else if (is_8x_up_kernel) + { + gpu_dp_inst_t uniResize8xUp_l00_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, + 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize8xUp_l01_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, + 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize8xUp_l10_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, + 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize8xUp_l11_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, + 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize8xUp_l20_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, + 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize8xUp_l21_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, + 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize8xUp_l30_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, + 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniResize8xUp_l31_4x8 = { { + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x87690907, 0xa54b0b05, 0xc32d0d03, 
0xe10f0f01, + 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant + }, GPU_DP_TYPE_16 }; + + if (!is_same_type) + { + float f2i_radio = 256.0f; + gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift); + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio); + + gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift); + uniResize8xUp_l00_4x8.data[7] = 0x00000700; + uniResize8xUp_l01_4x8.data[7] = 0x00000700; + uniResize8xUp_l10_4x8.data[7] = 0x00000700; + uniResize8xUp_l11_4x8.data[7] = 0x00000700; + uniResize8xUp_l20_4x8.data[7] = 0x00000700; + uniResize8xUp_l21_4x8.data[7] = 0x00000700; + uniResize8xUp_l30_4x8.data[7] = 0x00000700; + uniResize8xUp_l31_4x8.data[7] = 0x00000700; + + status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8", + &uniU8PostProcess_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP); + CHECK_STATUS_FAIL_GOTO(status, final); + } + + status = vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8); + status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final); + } } else { @@ -1193,22 +1345,22 @@ static vsi_status _query_kernel if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0]) { - if (is_same_type && (!align_corners) && (half_pixel_centers) && is_2x_upsample) + if ((!align_corners) && (half_pixel_centers) && is_2x_upsample) { scale_flag = UP_2X_HALF; initializer = _bilinear_half_pixel_centers_opt_initializer; } - else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_3x_upsample) + else if ((!align_corners) && (half_pixel_centers) && is_3x_upsample) { scale_flag = UP_3X_HALF; initializer = _bilinear_half_pixel_centers_opt_initializer; } - else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_4x_upsample) + else if ((!align_corners) && (half_pixel_centers) && is_4x_upsample) { scale_flag = UP_4X_HALF; initializer = _bilinear_half_pixel_centers_opt_initializer; } - else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_8x_upsample) + else if ((!align_corners) && (half_pixel_centers) && is_8x_upsample) { scale_flag = UP_8X_HALF; initializer = _bilinear_half_pixel_centers_opt_initializer; @@ -1232,7 +1384,7 @@ static vsi_status _query_kernel scale_flag = DOWN; } - key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); + key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type); for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) { if( kernel_map[i].key == key ) @@ -1244,7 +1396,7 @@ static vsi_status _query_kernel if ((UP_OPT == scale_flag) && (i >= kernel_map_size)) { scale_flag = UP; - key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); + key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, 
scale_flag, is_same_type); for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) { if( kernel_map[i].key == key ) @@ -1257,7 +1409,7 @@ static vsi_status _query_kernel if ((UP == scale_flag) && (i >= kernel_map_size)) { scale_flag = DOWN; - key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); + key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type); for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) { if( kernel_map[i].key == key ) @@ -1433,7 +1585,7 @@ static vsi_bool _is_image_width_lt16 size_t bytes = vsi_nn_kernel_dtype_get_bytes(in_dtype); vsi_size_t max_cross_read_img_width = bytes == 1 ? 16 : 8; - if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) + if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver) { return FALSE; } @@ -1468,7 +1620,8 @@ static vsi_nn_kernel_node_t _setup int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); vsi_bool is_same_type = vsi_nn_is_same_type(inputs[0], outputs[0]); - vsi_bool is_evis2 = (vsi_bool)(graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_2); + vsi_bool is_evis2 = \ + (vsi_bool)(((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver == VSI_NN_HW_EVIS_2); vsi_bool is_run_opt_kernel = FALSE; vsi_nn_tensor_t* scale = NULL; int32_t pad_left = half_pixel_centers ? 1 : 0; diff --git a/src/tim/vx/internal/src/kernel/evis/rope_evis.c b/src/tim/vx/internal/src/kernel/evis/rope_evis.c new file mode 100644 index 0000000..381abeb --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/rope_evis.c @@ -0,0 +1,744 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
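+ * Tensor-layout abbreviations used by the kernel-map entries below: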
+ B---batch + N---num_heads + S---sequence length + H---head size + */ +typedef enum +{ + LAYOUT_NONE, + LAYOUT_BNHS, + LAYOUT_BNH1, + LAYOUT_BSNH, + LAYOUT_BNSH, +} _internal_rope_layout_e; + +// Add kernel hashtable here +#define STR(a) #a +#define ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT, INTERLEAVED ) \ + ((IN0_DTYPE) | (IN1_DTYPE << 8) | (OUT_DTYPE << 16) | (LAYOUT << 24) | (INTERLEAVED << 28)) +#define PACK_KERNEL_BNHS_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNHS, 0 ), \ + CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnhs"), \ + "rope_0" } +#define PACK_KERNEL_BNH1_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNH1, 0 ), \ + CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnh1"), \ + "rope_1" } + +#define PACK_KERNEL_BSNH_INTERLEVEAD_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BSNH, 1 ), \ + CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bsnh"), \ + "rope_2" } + +#define PACK_KERNEL_BNSH_INTERLEVEAD_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNSH, 1 ), \ + CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnsh"), \ + "rope_3" } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +#define PACK_KERNEL_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ + PACK_KERNEL_BNHS_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + PACK_KERNEL_BNH1_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + PACK_KERNEL_BSNH_INTERLEVEAD_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + PACK_KERNEL_BNSH_INTERLEVEAD_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), + +static const _kernel_map_type _rope_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( BF16, BF16, BF16) + PACK_KERNEL_MAP( F16, F16, F16 ) + PACK_KERNEL_MAP( I16, I16, I16 ) + PACK_KERNEL_MAP( I16, F16, I16 ) + PACK_KERNEL_MAP( I16, I16, I8 ) + PACK_KERNEL_MAP( I16, F16, I8 ) + PACK_KERNEL_MAP( I16, I16, U8 ) + PACK_KERNEL_MAP( I16, F16, U8 ) + PACK_KERNEL_MAP( U16, U16, U16 ) + PACK_KERNEL_MAP( U16, F16, U16 ) + PACK_KERNEL_MAP( I8, I8, I8 ) + PACK_KERNEL_MAP( I8, F16, I8 ) + PACK_KERNEL_MAP( U8, U8, U8 ) + PACK_KERNEL_MAP( U8, F16, U8 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _rope_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _ROPE_PARAM_NUM _cnt_of_array( _rope_kernel_param_def ) +#define SCALAR_AXIS (4) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_rope_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t* out_attr = NULL; + vsi_nn_kernel_tensor_attr_t* in0_attr = NULL; + vsi_nn_kernel_tensor_attr_t* in1_attr = NULL; + vsi_nn_kernel_tensor_attr_t* in2_attr = NULL; + vsi_size_array_t* in_shape = NULL; + vsi_nn_kernel_dtype_e in0_dtype = F16; + vsi_nn_kernel_dtype_e in1_dtype = F16; + vsi_nn_kernel_dtype_e in2_dtype = F16; + 
vsi_nn_kernel_dtype_e out_dtype = F16; + float in0_scale = 1.0f; + float in1_scale = 1.0f; + float in2_scale = 1.0f; + float output_scale = 1.0f; + float output_zp = 0; + int32_t in0_zp = 0; + int32_t cos_zp = 0; + int32_t sin_zp = 0; + int32_t p = 0; + int32_t axis = 0; + int32_t interleaved = 0; + int32_t half_head_size = 1; + vsi_size_t shape[3] = {1}; + uint32_t pack_key = 0; + + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param); + VSI_UNREFERENCED(param_size); + // Add initializer + + in0_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]); + CHECK_PTR_FAIL_GOTO(in0_attr, "Create tensor attr buffer fail.", final); + in1_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]); + CHECK_PTR_FAIL_GOTO(in1_attr, "Create tensor attr buffer fail.", final); + in2_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]); + CHECK_PTR_FAIL_GOTO(in2_attr, "Create tensor attr buffer fail.", final); + out_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[3]); + CHECK_PTR_FAIL_GOTO(out_attr, "Create tensor attr buffer fail.", final); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &p); + CHECK_STATUS_FAIL_GOTO(status, final); + + axis = p & 0xFFFF; + interleaved = (p >> 16) & 0xFFFF; + + in_shape = in0_attr->shape; + in0_dtype = in0_attr->dtype; + in1_dtype = in1_attr->dtype; + in2_dtype = in2_attr->dtype; + out_dtype = out_attr->dtype; + + in0_scale = in0_attr->scale; + in1_scale = in1_attr->scale; + in2_scale = in2_attr->scale; + in0_zp = -in0_attr->zero_point; + cos_zp = -in1_attr->zero_point; + sin_zp = -in2_attr->zero_point; + output_scale = out_attr->scale; + output_zp = (float)out_attr->zero_point; + + half_head_size = (int32_t)(in_shape->data[axis] / 2); + shape[0] = in_shape->data[0]; + shape[1] = in_shape->data[1]; + shape[2] = in_shape->data[2]; + shape[axis] = half_head_size; + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2((shape[0] + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = shape[1]; + gpu_param.global_size[2] = shape[2]; + +#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \ + ((IN0_TYPE) | (IN1_TYPE << 8) | (IN2_TYPE << 16) | (OUT_TYPE << 24)) + + pack_key = _PACK_SELECT_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype); + switch (pack_key) + { + case _PACK_SELECT_KEY(BF16, BF16, BF16, BF16): + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = { { + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = { { + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = { { + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + if (interleaved && axis == 0) + { + uniExtractOddData_2x8.data[1] = 0x10101010; + uniExtractOddData_2x8.data[2] = 0x03030101; + uniExtractOddData_2x8.data[3] = 0x07070505; + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, + "half_head_size", &half_head_size); + CHECK_STATUS_FAIL_GOTO(status, final); + } + status = vsi_nn_kernel_gpu_add_param(node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8); + CHECK_STATUS_FAIL_GOTO(status, final); + } + break; + case _PACK_SELECT_KEY(I16, I16, I16, I16): + case _PACK_SELECT_KEY(I16, F16, F16, I16): + case _PACK_SELECT_KEY(I16, I16, I16, I8): + case _PACK_SELECT_KEY(I16, F16, F16, I8): + case _PACK_SELECT_KEY(I16, I16, I16, U8): + case _PACK_SELECT_KEY(I16, F16, F16, U8): + case _PACK_SELECT_KEY(F16, F16, F16, F16): + { + float scale0 = in0_scale * in1_scale / output_scale; + float scale1 = in0_scale* in2_scale / output_scale; + gpu_dp_inst_t uniExtractHalf8_2x8 = { { + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = { { + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniATimesB_0_4x4 = { { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x01010101, // BSelt + 0x00010000, 0x00030002, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniATimesB_1_4x4 = { { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x01010101, // BSelt + 0x00050004, 0x00070006, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAEvenTimesB_0_4x4 = { { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x01010101, // BSelt + 0x00010000, 0x00030002, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAEvenTimesB_1_4x4 = { { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x01010101, // BSelt + 0x00050004, 0x00070006, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAOddTimesB_0_4x4 = { { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00030001, 0x00070005, // ABin + 0x01010101, // BSelt + 
0x00010000, 0x00030002, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAOddTimesB_1_4x4 = { { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00030001, 0x00070005, // ABin + 0x01010101, // BSelt + 0x00050004, 0x00070006, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if (interleaved && axis == 0) + { + uniExtractHalf8_2x8.data[1] = 0x10101010; + uniExtractHalf8_2x8.data[2] = 0x02020000; + uniExtractHalf8_2x8.data[3] = 0x06060404; + uniExtractInteger_2x8.data[1] = 0x10101010; + uniExtractInteger_2x8.data[2] = 0x01010000; + uniExtractInteger_2x8.data[3] = 0x03030202; + + status = vsi_nn_kernel_gpu_add_param(node, + "uniAEvenTimesB_0_4x4", &uniAEvenTimesB_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniAEvenTimesB_1_4x4", &uniAEvenTimesB_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniAOddTimesB_0_4x4", &uniAOddTimesB_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniAOddTimesB_1_4x4", &uniAOddTimesB_1_4x4); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, + "uniATimesB_0_4x4", &uniATimesB_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniATimesB_1_4x4", &uniATimesB_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "half_head_size", &half_head_size); + } + status |= vsi_nn_kernel_gpu_add_param(node, + "scale0", &scale0); + status |= vsi_nn_kernel_gpu_add_param(node, + "scale1", &scale1); + status |= vsi_nn_kernel_gpu_add_param(node, + "output_zp", &output_zp); + if (out_dtype == F16) + { + status |= vsi_nn_kernel_gpu_add_param(node, + "uniExtract8Data_2x8", &uniExtractHalf8_2x8); + } + else + { + status |= vsi_nn_kernel_gpu_add_param(node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8); + } + CHECK_STATUS_FAIL_GOTO(status, final); + } + break; + case _PACK_SELECT_KEY(I8, I8, I8, I8): + case _PACK_SELECT_KEY(U8, U8, U8, U8): + case _PACK_SELECT_KEY(U16, U16, U16, U16): + case _PACK_SELECT_KEY(I8, F16, F16, I8): + case _PACK_SELECT_KEY(U8, F16, F16, U8): + case _PACK_SELECT_KEY(U16, F16, F16, U16): + { + float scale0 = in0_scale * in1_scale / output_scale; + float scale1 = in0_scale* in2_scale / output_scale; + gpu_dp_inst_t uniExtractInteger_2x8 = { { + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAMinusZp_0_4x4 = { { + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAMinusZp_1_4x4 = { { + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t 
uniAEvenMinusZp_4x4 = { { + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAOddMinusZp_4x4 = { { + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00030001, 0x00070005, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if (interleaved && axis == 0) + { + uniExtractInteger_2x8.data[1] = 0x10101010; + uniExtractInteger_2x8.data[2] = 0x01010000; + uniExtractInteger_2x8.data[3] = 0x03030202; + + status = vsi_nn_kernel_gpu_add_param(node, + "uniAEvenMinusZp_4x4", &uniAEvenMinusZp_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniAOddMinusZp_4x4", &uniAOddMinusZp_4x4); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, + "half_head_size", &half_head_size); + } + status |= vsi_nn_kernel_gpu_add_param(node, + "uniAMinusZp_0_4x4", &uniAMinusZp_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniAMinusZp_1_4x4", &uniAMinusZp_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, + "scale0", &scale0); + status |= vsi_nn_kernel_gpu_add_param(node, + "scale1", &scale1); + status |= vsi_nn_kernel_gpu_add_param(node, + "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, + "in0_zp", &in0_zp); + status |= vsi_nn_kernel_gpu_add_param(node, + "cos_zp", &cos_zp); + status |= vsi_nn_kernel_gpu_add_param(node, + "sin_zp", &sin_zp); + status |= vsi_nn_kernel_gpu_add_param(node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8); + CHECK_STATUS_FAIL_GOTO(status, final); + } + break; + default: + break; + } + status = vsi_nn_kernel_gpu_config(node, &gpu_param); +final: + if (in0_attr) vsi_nn_kernel_tensor_attr_release(&in0_attr); + if (in1_attr) vsi_nn_kernel_tensor_attr_release(&in1_attr); + if (in2_attr) vsi_nn_kernel_tensor_attr_release(&in2_attr); + if (out_attr) vsi_nn_kernel_tensor_attr_release(&out_attr); + return status; +} /* _rope_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + int32_t interleaved, + _internal_rope_layout_e *layout + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e in2_dtype; + vsi_nn_kernel_dtype_e out_dtype; + int32_t in0_zp = vsi_nn_get_tensor_zero_point(inputs[0]); + int32_t in1_zp = vsi_nn_get_tensor_zero_point(inputs[1]); + int32_t in2_zp = vsi_nn_get_tensor_zero_point(inputs[2]); + const _kernel_map_type * kernel_map = _rope_kernel_map; + size_t kernel_map_size = _cnt_of_array( _rope_kernel_map ); + vx_param_description_t * param_def = _rope_kernel_param_def; + vx_kernel_initialize_f initializer = _rope_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + /*only support symmetric int16*/ + if ( ( (in0_dtype == I16 && 
in1_dtype == I16 && out_dtype == I16) || + (in0_dtype == I16 && in1_dtype == F16 && out_dtype == I16) || + (in0_dtype == I16 && in1_dtype == F16 && out_dtype == I8) || + (in0_dtype == I16 && in1_dtype == I16 && out_dtype == I8) || + (in0_dtype == I16 && in1_dtype == F16 && out_dtype == U8) || + (in0_dtype == I16 && in1_dtype == I16 && out_dtype == U8) ) && + (in0_zp != 0 || in1_zp != 0 || in2_zp != 0)) + { + return VSI_FAILURE; + } + + if (axis == 1 && inputs[0]->attr.size[0] == inputs[1]->attr.size[0] && + in1_dtype == in2_dtype) + { + if (inputs[0]->attr.size[0] == 1) + { + *layout = LAYOUT_BNH1; + } + else + { + *layout = LAYOUT_BNHS; + } + } + else if (axis == 0 && in1_dtype == in2_dtype) + { + if (inputs[0]->attr.size[2] == inputs[1]->attr.size[2] && + inputs[1]->attr.size[1] == 1) + { + *layout = LAYOUT_BSNH; + } + else + { + *layout = LAYOUT_BNSH; + } + } + + key = ROPE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, *layout, interleaved); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _rope_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ROPE_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + int32_t i = 0; + int32_t interleaved = 0; + int32_t param = 0; + vsi_size_t shape[3][VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_nn_tensor_t* rs_tensors[4] = { NULL }; + vsi_nn_tensor_t* reshape_tensors[4] = { NULL }; + _internal_rope_layout_e layout = LAYOUT_NONE; + + VSI_UNREFERENCED(params); + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + interleaved = vsi_nn_kernel_param_get_int32(params, "interleaved"); + + // Check if gpu can support the size + if ( !vsi_nn_kernel_gpu_check_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, axis, interleaved, &layout ); + if (outputs[0]->attr.size[0] == 1 || layout == LAYOUT_BSNH) + { + memcpy(shape[0], inputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + memcpy(shape[1], inputs[1]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + memcpy(shape[2], outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + + if (outputs[0]->attr.size[0] == 1) + { + for (i = 1; i < 3; i++) + { + shape[0][i - 1] = shape[0][i]; + shape[1][i - 1] = shape[1][i]; + shape[2][i - 1] = shape[2][i]; + } + shape[0][2] = 1; + shape[1][2] = 1; + shape[2][2] = 1; + } + else + { + int32_t j = 0; + for (i = 0; i < 3; i++) + { + if (shape[1][i] != 1) + { + shape[1][j] = shape[1][i]; + j ++; + } + } + for (; j < 3; j++) + { + shape[1][j] = 1; + } + } + + rs_tensors[0] = vsi_nn_reshape_tensor(graph, + 
inputs[0], shape[0], inputs[0]->attr.dim_num); + rs_tensors[1] = vsi_nn_reshape_tensor(graph, + inputs[1], shape[1], inputs[1]->attr.dim_num); + rs_tensors[2] = vsi_nn_reshape_tensor(graph, + inputs[2], shape[1], inputs[2]->attr.dim_num); + rs_tensors[3] = vsi_nn_reshape_tensor(graph, + outputs[0], shape[2], outputs[0]->attr.dim_num); + + if (outputs[0]->attr.size[0] == 1 && axis > 0) + { + axis--; + } + reshape_tensors[0] = rs_tensors[0]; + reshape_tensors[1] = rs_tensors[1]; + reshape_tensors[2] = rs_tensors[2]; + reshape_tensors[3] = rs_tensors[3]; + } + else + { + reshape_tensors[0] = inputs[0]; + reshape_tensors[1] = inputs[1]; + reshape_tensors[2] = inputs[2]; + reshape_tensors[3] = outputs[0]; + } + + param = (interleaved << 16) | axis; + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ROPE_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[3], output_num ); + /* Pass parameters to node. */ + node_params[SCALAR_AXIS] = vsi_nn_kernel_scalar_create(graph, I32, ¶m); + status = vsi_nn_kernel_node_pass_param( node, node_params, _ROPE_PARAM_NUM ); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_AXIS]); + } + } + + for (i = 0; i < 4; i++) + { + vsi_safe_release_tensor(rs_tensors[i]); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( rope, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c index 4786abb..8662c71 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c @@ -186,18 +186,26 @@ static const _kernel_map_type scatter_nd_update_special_ref_map[] = { TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_4) }; static const _kernel_map_type scatter_nd_update_special_update_map[] = { TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_4) }; static const _kernel_map_type scatter_nd_update_special_copy_map[] = { TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4) }; /* @@ -563,6 +571,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer) { case _PACK_SELECT_KEY( I8, I8 ): case _PACK_SELECT_KEY( U8, U8 ): + case _PACK_SELECT_KEY( I16, I16 ): + case _PACK_SELECT_KEY( U16, U16 ): { uint16_t M0 = 0; int32_t postShift0 = 0; @@ -605,6 +615,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); 
} break; + case _PACK_SELECT_KEY( F16, F16 ): + break; default: break; } @@ -759,6 +771,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer) { case _PACK_SELECT_KEY( I8, I8 ): case _PACK_SELECT_KEY( U8, U8 ): + case _PACK_SELECT_KEY( I16, I16 ): + case _PACK_SELECT_KEY( U16, U16 ): { uint16_t M1 = 0; int32_t postShift1 = 0; @@ -801,6 +815,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( F16, F16 ): + break; default: break; } @@ -1597,6 +1613,19 @@ static vsi_status _query_kernel_special status |= VSI_FAILURE; } + if (input0_dtype == F16) + { + input0_dtype = U16; + } + if (input2_dtype == F16) + { + input2_dtype = U16; + } + if (output_dtype == F16) + { + output_dtype = U16; + } + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 6, 1, 0); for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_copy_map); i ++ ) diff --git a/src/tim/vx/internal/src/kernel/evis/swish_evis.c b/src/tim/vx/internal/src/kernel/evis/swish_evis.c index f1ad40b..9006707 100644 --- a/src/tim/vx/internal/src/kernel/evis/swish_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/swish_evis.c @@ -27,6 +27,7 @@ #include #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" @@ -591,7 +592,7 @@ static vsi_nn_kernel_node_t _setup VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); #if (VX_ACTIVATION_EXT_SUPPORT) - if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) + if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver) { return NULL; } diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index 8ff82f5..d4083d3 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -548,16 +548,16 @@ static vsi_status _gpu_register vsi_status status; vx_kernel_description_t* info; vx_kernel obj; - vsi_nn_context_t context; vx_program program = NULL; const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt; + vsi_nn_runtime_option_t* options; + options = ((vsi_nn_graph_prv_t*)graph)->options; #define MAX_BUILDPROGRAM_LEN 1024 char cmd[MAX_BUILDPROGRAM_LEN] = { 0 }; size_t cost_bytes = 0; memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN ); - context = graph->ctx; status = VSI_FAILURE; info = &(kernel->info); @@ -579,21 +579,21 @@ static vsi_status _gpu_register return status; } - if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE ) + if (options->config.evis.ver == VSI_NN_HW_EVIS_NONE) { // set default evis version is 2 if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type ) { cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", - context->config.use_40bits_va ); + options->config.use_40bits_va ); } } else { cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", - context->config.evis.ver, context->config.use_40bits_va ); + options->config.evis.ver, options->config.use_40bits_va ); } // Pack build option if( kernel->gpu.sources[active_fmt].build_option.data ) @@ -655,16 +655,16 @@ static vsi_status _gpu_register_ext vsi_status status; vx_kernel_description_t* info; vx_kernel obj; - vsi_nn_context_t context; vx_program program = NULL; const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt; + vsi_nn_runtime_option_t* 
options; + options = ((vsi_nn_graph_prv_t*)graph)->options; #define MAX_BUILDPROGRAM_LEN 1024 char cmd[MAX_BUILDPROGRAM_LEN] = { 0 }; size_t cost_bytes = 0; memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN ); - context = graph->ctx; status = VSI_FAILURE; info = &(kernel->info); @@ -686,21 +686,21 @@ static vsi_status _gpu_register_ext return status; } - if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE ) + if (options->config.evis.ver == VSI_NN_HW_EVIS_NONE) { // set default evis version is 2 if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type ) { cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", - context->config.use_40bits_va ); + options->config.use_40bits_va ); } } else { cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", - context->config.evis.ver, context->config.use_40bits_va ); + options->config.evis.ver, options->config.use_40bits_va ); } // Pack build option if( kernel->gpu.sources[active_fmt].build_option.data ) @@ -1258,7 +1258,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector } /* Skip evis if not support */ if( type == VSI_NN_KERNEL_TYPE_EVIS - && graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_NONE ) + && ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver == VSI_NN_HW_EVIS_NONE ) { continue; } @@ -1677,7 +1677,7 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph) int32_t enableShader = ((vsi_nn_graph_prv_t*)graph)->options->enable_shader; #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT - if ( graph->ctx->config.subGroupSize == 0 ) + if ( ((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize == 0 ) { return FALSE; } diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index 92e94f6..bce1b01 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -162,15 +162,11 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(pow) #if (VX_TENSOR_GATHER_API_SUPPORT) REGISTER_VX_FIRST_KERNEL_SELECTOR(gather) #endif -#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT) REGISTER_VX_FIRST_KERNEL_SELECTOR(relational_ops) -#endif #if (VX_TENSOR_TILE_API_SUPPORT) REGISTER_VX_FIRST_KERNEL_SELECTOR(tile) #endif -#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) REGISTER_VX_FIRST_KERNEL_SELECTOR(layer_norm) -#endif #if (VX_ACTIVATION_EXP_VX_SUPPORT_EXT) REGISTER_VX_FIRST_KERNEL_SELECTOR(exp) #endif @@ -184,6 +180,7 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(log_softmax) #if (VX_BITCAST_VX_SUPPORT) REGISTER_VX_FIRST_KERNEL_SELECTOR(bitcast) #endif - +REGISTER_VX_FIRST_KERNEL_SELECTOR(group_norm) +REGISTER_VX_FIRST_KERNEL_SELECTOR(instance_norm) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/vx/group_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/group_norm_vx.c new file mode 100644 index 0000000..cdfe633 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/group_norm_vx.c @@ -0,0 +1,89 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* 
+* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if VX_GROUP_NORMALIZATION_VX_SUPPORT +#define REGISTER_GROUP_NORM_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_GROUP_NORM_OPENVX_KERNEL(group_norm) +{ + vx_node node = NULL; + float eps = vsi_nn_kernel_param_get_float32(params, "eps"); + int32_t group_num = vsi_nn_kernel_param_get_int32(params, "group_num"); + vx_tensor inputs_tensor[3] = { NULL }; + vx_tensor output_tensor = NULL; + + inputs_tensor[0] = inputs[0]->t; + inputs_tensor[1] = inputs[1]->t; + inputs_tensor[2] = inputs[2]->t; + output_tensor = outputs[0]->t; + + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(kernel); + + if (graph->ctx->config.support_ffd || + graph->ctx->config.support_stream_processor) + { + node = vxGroupNormalizationLayer( + graph->g, + eps, + group_num, + inputs_tensor, + (vx_uint32)input_num, + output_tensor + ); + } + + return (vsi_nn_kernel_node_t)node; +} /* group_norm() */ + +#endif diff --git a/src/tim/vx/internal/src/kernel/vx/instance_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/instance_norm_vx.c new file mode 100644 index 0000000..a363b41 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/instance_norm_vx.c @@ -0,0 +1,87 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if VX_INSTANCE_NORMALIZATION_VX_SUPPORT +#define REGISTER_INSTANCE_NORM_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_INSTANCE_NORM_OPENVX_KERNEL(instance_norm) +{ + vsi_nn_kernel_node_t node = NULL; + float eps = vsi_nn_kernel_param_get_float32(params, "eps"); + vx_tensor inputs_tensor[3] = { NULL }; + vx_tensor output_tensor = NULL; + + inputs_tensor[0] = inputs[0]->t; + inputs_tensor[1] = inputs[1]->t; + inputs_tensor[2] = inputs[2]->t; + output_tensor = outputs[0]->t; + + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(kernel); + + if (graph->ctx->config.support_ffd || + graph->ctx->config.support_stream_processor) + { + node = vxInstanceNormalizationLayer( + graph->g, + eps, + inputs_tensor, + (vx_uint32)input_num, + output_tensor + ); + } + + return (vsi_nn_kernel_node_t)node; +} /* instance_norm() */ + +#endif diff --git a/src/tim/vx/internal/src/kernel/vx/layer_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/layer_norm_vx.c index 00a2def..3cdd73b 100644 --- a/src/tim/vx/internal/src/kernel/vx/layer_norm_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/layer_norm_vx.c @@ -30,7 +30,7 @@ #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" -#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) +#if (VX_LAYER_NORMALIZATION_VX_SUPPORT) #define REGISTER_LAYER_NORM_OPENVX_KERNEL( kernel_name ) \ static vsi_nn_kernel_node_t _##kernel_name##setup \ ( \ @@ -71,14 +71,20 @@ REGISTER_LAYER_NORM_OPENVX_KERNEL( layer_norm ) inputs_tensor[2] = inputs[2]->t; output_tensor = outputs[0]->t; - node = vxLayerNormalizationLayer( - graph->g, - eps, - axis, - inputs_tensor, - (uint32_t)input_num, - output_tensor +#if !defined(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) || !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) + if (graph->ctx->config.support_ffd || + graph->ctx->config.support_stream_processor) +#endif + { + node = vxLayerNormalizationLayer( + graph->g, + eps, + axis, + inputs_tensor, + (uint32_t)input_num, + output_tensor ); + } return (vsi_nn_kernel_node_t)node; } /* layer_norm() */ diff --git a/src/tim/vx/internal/src/kernel/vx/pad2_vx.c b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c index c9a2c84..0d35c8d 100644 --- 
a/src/tim/vx/internal/src/kernel/vx/pad2_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c @@ -89,9 +89,10 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 ) if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) { vsi_nn_tensor_attr_t attr; + memcpy( &attr, &outputs[0]->attr, sizeof( attr ) ); memcpy( &attr.size, &inputs[0]->attr.size, sizeof( attr.size ) ); - attr.vtl = FALSE; + attr.vtl = TRUE; attr.is_const = FALSE; convert_tensor = vsi_nn_CreateTensor(graph, &attr); diff --git a/src/tim/vx/internal/src/kernel/vx/relationalops_vx.c b/src/tim/vx/internal/src/kernel/vx/relationalops_vx.c index 0d93b45..bfb26b9 100644 --- a/src/tim/vx/internal/src/kernel/vx/relationalops_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/relationalops_vx.c @@ -30,7 +30,7 @@ #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" -#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT) +#if (VX_RELATIONAL_OPS_VX_SUPPORT) #define REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( kernel_name ) \ static vsi_nn_kernel_node_t _##kernel_name##setup \ @@ -68,12 +68,25 @@ REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( relational_ops ) VSI_UNREFERENCED(kernel); VSI_UNREFERENCED(output_num); - node = vxRelationalLayer(graph->g, - operation, - inputs_tensor, - (uint32_t)input_num, - outputs[0]->t - ); +#if !defined(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) || !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) + if (vsi_nn_is_broadcast_operaton(inputs, input_num, outputs[0])) + { + return NULL; + } +#endif + +#if !defined(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) || !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) + if (graph->ctx->config.support_stream_processor) +#endif + { + node = vxRelationalLayer( + graph->g, + operation, + inputs_tensor, + (uint32_t)input_num, + outputs[0]->t + ); + } return (vsi_nn_kernel_node_t)node; } /* relational_ops() */ diff --git a/src/tim/vx/internal/src/kernel/vx/swish_vx.c b/src/tim/vx/internal/src/kernel/vx/swish_vx.c index 9b458c6..e758ced 100644 --- a/src/tim/vx/internal/src/kernel/vx/swish_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/swish_vx.c @@ -23,6 +23,7 @@ *****************************************************************************/ #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_node.h" #include "vsi_nn_log.h" @@ -66,7 +67,7 @@ REGISTER_SWISH_OPENVX_KERNEL( swish ) VSI_UNREFERENCED(output_num); VSI_UNREFERENCED(input_num); - if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) + if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver) { swish_type = (vsi_nn_swish_type)vsi_nn_kernel_param_get_int32(params, "type"); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl index bf9fd64..447f197 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl @@ -67,8 +67,8 @@ __kernel void cumsum_F32toF32_axis2( } } -#define CUMSUM_toU8_AXIS2_SH(name, src_type, read_image_type) \ -__kernel void cumsum_##name##toU8_axis2( \ +#define CUMSUM_toINT_AXIS2_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \ +__kernel void cumsum_##name##_axis2( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ int axis, \ @@ -87,19 +87,19 @@ __kernel void cumsum_##name##toU8_axis2( \ int4 coord_out = coord; \ \ src_type sum = (src_type)(0); \ - uint4 dst = (uint4)(0); \ + dst_type dst = (dst_type)(0); \ int tmp_zp = convert_int_rte(output_zp); \ - dst.x = convert_uint_sat(tmp_zp); \ + dst.x = convert_dtype(tmp_zp); \ \ 
float cnt = 0.0f; \ \ if(exclusive && rev) \ { \ coord_out.z = channel - 1; \ - write_imageui(output, coord_out, dst); \ + image_write(output, coord_out, dst); \ for(coord.z = channel - 1; coord.z > 0; coord.z--) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ coord_out.z--; \ cnt += 1.0f; \ sum += data; \ @@ -107,17 +107,17 @@ __kernel void cumsum_##name##toU8_axis2( \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord_out, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord_out, dst); \ } \ } \ else if(exclusive) \ { \ coord_out.z = 0; \ - write_imageui(output, coord_out, dst); \ + image_write(output, coord_out, dst); \ for(coord.z = 0; coord.z < channel - 1; coord.z++) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ coord_out.z++; \ cnt += 1.0f; \ sum += data; \ @@ -125,45 +125,44 @@ __kernel void cumsum_##name##toU8_axis2( \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord_out, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord_out, dst); \ } \ } \ else if(rev) \ { \ for(coord.z = channel - 1; coord.z >= 0; coord.z--) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ cnt += 1.0f; \ sum += data; \ \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord, dst); \ } \ } \ else \ { \ for(coord.z = 0; coord.z < channel; coord.z++) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ cnt += 1.0f; \ sum += data; \ \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord, dst); \ } \ } \ } -CUMSUM_toU8_AXIS2_SH(U8,uint4,read_imageui) -CUMSUM_toU8_AXIS2_SH(F32,float4,read_imagef) - - +CUMSUM_toINT_AXIS2_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte) +CUMSUM_toINT_AXIS2_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte) +CUMSUM_toINT_AXIS2_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte) __kernel void cumsum_F32toF32_axis1( __read_only image2d_array_t input, @@ -233,10 +232,10 @@ __kernel void cumsum_F32toF32_axis1( } } -#define CUMSUM_toU8_AXIS1_SH(name, src_type, read_image_type) \ -__kernel void cumsum_##name##toU8_axis1( \ - __read_only image2d_array_t input, \ - __write_only image2d_array_t output, \ +#define CUMSUM_toINT_AXIS1_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \ +__kernel void cumsum_##name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ int axis, \ int exclusive, \ int rev, \ @@ -253,20 +252,20 @@ __kernel void cumsum_##name##toU8_axis1( \ int4 coord_out = coord; \ \ src_type sum = (src_type)(0); \ - uint4 dst = (uint4)(0); \ + dst_type dst = (dst_type)(0); \ int tmp_zp = convert_int_rte(output_zp); \ - dst.x = convert_uint_sat(tmp_zp); \ + dst.x = 
convert_dtype(tmp_zp); \ \ float cnt = 0; \ \ if(exclusive && rev) \ { \ coord_out.y = height - 1; \ - write_imageui(output, coord_out, dst); \ + image_write(output, coord_out, dst); \ \ for(coord.y = height - 1; coord.y > 0; coord.y--) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ cnt += 1.0f; \ coord_out.y--; \ sum += data; \ @@ -274,17 +273,17 @@ __kernel void cumsum_##name##toU8_axis1( \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord_out, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord_out, dst); \ } \ } \ else if(exclusive) \ { \ coord_out.y = 0; \ - write_imageui(output, coord_out, dst); \ + image_write(output, coord_out, dst); \ for(coord.y = 0; coord.y < height - 1; coord.y++) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ cnt += 1.0f; \ coord_out.y++; \ sum += data; \ @@ -292,44 +291,44 @@ __kernel void cumsum_##name##toU8_axis1( \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord_out, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord_out, dst); \ } \ } \ else if(rev) \ { \ for(coord.y = height - 1; coord.y >= 0; coord.y--) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ cnt += 1.0f; \ sum += data; \ \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord, dst); \ } \ } \ else \ { \ for(coord.y = 0; coord.y < height; coord.y++) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ cnt += 1.0f; \ sum += data; \ \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord, dst); \ } \ } \ } -CUMSUM_toU8_AXIS1_SH(U8,uint4,read_imageui) -CUMSUM_toU8_AXIS1_SH(F32,float4,read_imagef) - +CUMSUM_toINT_AXIS1_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte) +CUMSUM_toINT_AXIS1_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte) +CUMSUM_toINT_AXIS1_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte) __kernel void cumsum_F32toF32_axis0( __read_only image2d_array_t input, @@ -399,8 +398,8 @@ __kernel void cumsum_F32toF32_axis0( } } -#define CUMSUM_toU8_AXIS0_SH(name, src_type, read_image_type) \ -__kernel void cumsum_##name##toU8_axis0( \ +#define CUMSUM_toINT_AXIS0_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \ +__kernel void cumsum_##name##_axis0( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ int axis, \ @@ -419,19 +418,19 @@ __kernel void cumsum_##name##toU8_axis0( \ int4 coord_out = coord; \ \ src_type sum = (src_type)(0); \ - uint4 dst = (uint4)(0); \ + dst_type dst = (dst_type)(0); \ int tmp_zp = convert_int_rte(output_zp); \ - dst.x = convert_uint_sat(tmp_zp); \ + dst.x = convert_dtype(tmp_zp); \ \ float cnt = 0; \ \ if(exclusive && rev) \ { \ coord_out.x = width - 
1; \ - write_imageui(output, coord_out, dst); \ + image_write(output, coord_out, dst); \ for(coord.x = width - 1; coord.x > 0; coord.x--) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ coord_out.x--; \ cnt += 1.0f; \ sum += data; \ @@ -439,8 +438,8 @@ __kernel void cumsum_##name##toU8_axis0( \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord_out, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord_out, dst); \ } \ } \ else if(exclusive) \ @@ -449,7 +448,7 @@ __kernel void cumsum_##name##toU8_axis0( \ write_imageui(output, coord_out, dst); \ for(coord.x = 0; coord.x < width - 1; coord.x++) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ coord_out.x++; \ cnt += 1.0f; \ sum += data; \ @@ -457,40 +456,42 @@ __kernel void cumsum_##name##toU8_axis0( \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord_out, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord_out, dst); \ } \ } \ else if(rev) \ { \ for(coord.x = width - 1; coord.x >= 0; coord.x--) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ cnt += 1.0f; \ sum += data; \ \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord, dst); \ } \ } \ else \ { \ for(coord.x = 0; coord.x < width; coord.x++) \ { \ - src_type data = read_image_type(input, coord); \ + src_type data = image_read(input, coord); \ cnt += 1.0f; \ sum += data; \ \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \ \ - dst.x = (uint)convert_int_rte(tmpSum); \ - write_imageui(output, coord, dst); \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord, dst); \ } \ } \ } -CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui) -CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef) +CUMSUM_toINT_AXIS0_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte) +CUMSUM_toINT_AXIS0_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte) +CUMSUM_toINT_AXIS0_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte) + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl index 3a90480..f89cf5e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl @@ -65,188 +65,100 @@ __kernel void cumsum_F32toF32_axis1_2D( } } -__kernel void cumsum_U8toU8_axis1_2D( - __read_only image2d_t input, - __write_only image2d_t output, - int axis, - int exclusive, - int rev, - int width, - int height, - int chn, - int input_zp, - float in_out_scale, - float in_out_zp_scale, - float output_zp - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); - - uint4 sum = (uint4)(0); - uint4 dst = (uint4)(0); - - int tmp_zp = convert_int_rte(output_zp); - dst.x = convert_uint_sat(tmp_zp); - - float cnt = 0; - - if(exclusive && rev) - { - coord.w = height - 1; - write_imageui(output, coord.zw, dst); - 
for(coord.y = height - 1; coord.y > 0; coord.y--) - { - uint4 data = read_imageui(input, coord.xy); - cnt += 1.0f; - coord.w--; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.zw, dst); - } - } - else if(exclusive) - { - write_imageui(output, coord.zw, dst); - for(coord.y = 0; coord.y < height - 1; coord.y++) - { - uint4 data = read_imageui(input, coord.xy); - cnt += 1.0f; - coord.w++; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.zw, dst); - } - } - else if(rev) - { - for(coord.y = height - 1; coord.y >= 0; coord.y--) - { - uint4 data = read_imageui(input, coord.xy); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.xy, dst); - } - } - else - { - for(coord.y = 0; coord.y < height; coord.y++) - { - uint4 data = read_imageui(input, coord.xy); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.xy, dst); - } - } -} - -__kernel void cumsum_F32toU8_axis1_2D( - __read_only image2d_t input, - __write_only image2d_t output, - int axis, - int exclusive, - int rev, - int width, - int height, - int chn, - int input_zp, - float in_out_scale, - float in_out_zp_scale, - float output_zp - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); - - float4 sum = (float4)(0); - uint4 dst = (uint4)(0); - int tmp_zp = convert_int_rte(output_zp); - dst.x = convert_uint_sat(tmp_zp); - - float cnt = 0; - - if(exclusive && rev) - { - coord.w = height - 1; - write_imageui(output, coord.zw, dst); - for(coord.y = height - 1; coord.y > 0; coord.y--) - { - float4 data = read_imagef(input, coord.xy); - cnt += 1.0f; - coord.w--; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.zw, dst); - } - } - else if(exclusive) - { - write_imageui(output, coord.zw, dst); - for(coord.y = 0; coord.y < height - 1; coord.y++) - { - float4 data = read_imagef(input, coord.xy); - cnt += 1.0f; - coord.w++; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.zw, dst); - } - } - else if(rev) - { - for(coord.y = height - 1; coord.y >= 0; coord.y--) - { - float4 data = read_imagef(input, coord.xy); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.xy, dst); - } - } - else - { - for(coord.y = 0; coord.y < height; coord.y++) - { - float4 data = read_imagef(input, coord.xy); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.xy, dst); - } - } +#define CUMSUM_INT_AXIS1_2D_SH(name, src_type, image_read, dst_type, 
image_write, convert_dtype) \ +__kernel void cumsum_##name##_axis1_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, \ + int exclusive, \ + int rev, \ + int width, \ + int height, \ + int chn, \ + int input_zp, \ + float in_out_scale, \ + float in_out_zp_scale, \ + float output_zp \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + src_type sum = (src_type)(0); \ + dst_type dst = (dst_type)(0); \ + int tmp_zp = convert_int_rte(output_zp); \ + dst.x = convert_dtype(tmp_zp); \ + \ + float cnt = 0; \ + \ + if(exclusive && rev) \ + { \ + coord.w = height - 1; \ + image_write(output, coord.zw, dst); \ + for(coord.y = height - 1; coord.y > 0; coord.y--) \ + { \ + src_type data = image_read(input, coord.xy); \ + cnt += 1.0f; \ + coord.w--; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord.zw, dst); \ + } \ + } \ + else if(exclusive) \ + { \ + image_write(output, coord.zw, dst); \ + for(coord.y = 0; coord.y < height - 1; coord.y++) \ + { \ + src_type data = image_read(input, coord.xy); \ + cnt += 1.0f; \ + coord.w++; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord.zw, dst); \ + } \ + } \ + else if(rev) \ + { \ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \ + { \ + src_type data = image_read(input, coord.xy); \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord.xy, dst); \ + } \ + } \ + else \ + { \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + src_type data = image_read(input, coord.xy); \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = convert_dtype(tmpSum); \ + image_write(output, coord.xy, dst); \ + } \ + } \ } +CUMSUM_INT_AXIS1_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte) +CUMSUM_INT_AXIS1_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte) +CUMSUM_INT_AXIS1_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte) __kernel void cumsum_F32toF32_axis0_2D( __read_only image2d_t input, @@ -316,188 +228,100 @@ __kernel void cumsum_F32toF32_axis0_2D( } } -__kernel void cumsum_U8toU8_axis0_2D( - __read_only image2d_t input, - __write_only image2d_t output, - int axis, - int exclusive, - int rev, - int width, - int height, - int chn, - int input_zp, - float in_out_scale, - float in_out_zp_scale, - float output_zp - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); - - uint4 sum = (uint4)(0); - uint4 dst = (uint4)(0); - - int tmp_zp = convert_int_rte(output_zp); - dst.x = convert_uint_sat(tmp_zp); - - float cnt = 0.0f; - - if(exclusive && rev) - { - coord.x = width - 1; - coord.z = coord.x; - write_imageui(output, coord.zw, dst); - for(; coord.x > 0; coord.x--) - { - uint4 data = read_imageui(input, coord.xy); - coord.z--; - cnt += 1.0; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = 
(uint)convert_int_rte(tmpSum); - write_imageui(output, coord.zw, dst); - } - } - else if(exclusive) - { - coord.z = 0; - write_imageui(output, coord.zw, dst); - for(coord.x = 0; coord.x < width - 1; coord.x++) - { - uint4 data = read_imageui(input, coord.xy); - cnt += 1.0f; - coord.z++; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.zw, dst); - } - } - else if(rev) - { - for(coord.x = width - 1; coord.x >= 0; coord.x--) - { - uint4 data = read_imageui(input, coord.xy); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.xy, dst); - } - } - else - { - for(coord.x = 0; coord.x < width; coord.x++) - { - uint4 data = read_imageui(input, coord.xy); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.xy, dst); - } - } -} - -__kernel void cumsum_F32toU8_axis0_2D( - __read_only image2d_t input, - __write_only image2d_t output, - int axis, - int exclusive, - int rev, - int width, - int height, - int chn, - int input_zp, - float in_out_scale, - float in_out_zp_scale, - float output_zp - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); - - float4 sum = (float4)(0); - uint4 dst = (uint4)(0); - int tmp_zp = convert_int_rte(output_zp); - dst.x = convert_uint_sat(tmp_zp); - - float cnt = 0.0f; - if(exclusive && rev) - { - coord.x = width - 1; - coord.z = coord.x; - write_imageui(output, coord.zw, dst); - for(; coord.x > 0; coord.x--) - { - float4 data = read_imagef(input, coord.xy); - coord.z--; - cnt += 1.0; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.zw, dst); - } - } - else if(exclusive) - { - coord.z = 0; - write_imageui(output, coord.zw, dst); - for(coord.x = 0; coord.x < width - 1; coord.x++) - { - float4 data = read_imagef(input, coord.xy); - cnt += 1.0f; - coord.z++; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.zw, dst); - } - } - else if(rev) - { - for(coord.x = width - 1; coord.x >= 0; coord.x--) - { - float4 data = read_imagef(input, coord.xy); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.xy, dst); - } - } - else - { - for(coord.x = 0; coord.x < width; coord.x++) - { - float4 data = read_imagef(input, coord.xy); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord.xy, dst); - } - } +#define CUMSUM_INT_AXIS0_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \ +__kernel void cumsum_##name##_axis0_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, \ + int exclusive, \ + int rev, \ + int width, \ + int height, \ + int chn, \ + int 
input_zp, \
+    float in_out_scale, \
+    float in_out_zp_scale, \
+    float output_zp \
+    ) \
+{ \
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
+ \
+    src_type sum = (src_type)(0); \
+    dst_type dst = (dst_type)(0); \
+ \
+    int tmp_zp = convert_int_rte(output_zp); \
+    dst.x = convert_dtype(tmp_zp); \
+ \
+    float cnt = 0.0f; \
+ \
+    if(exclusive && rev) \
+    { \
+        coord.x = width - 1; \
+        coord.z = coord.x; \
+        image_write(output, coord.zw, dst); \
+        for(; coord.x > 0; coord.x--) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            coord.z--; \
+            cnt += 1.0; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.zw, dst); \
+        } \
+    } \
+    else if(exclusive) \
+    { \
+        coord.z = 0; \
+        image_write(output, coord.zw, dst); \
+        for(coord.x = 0; coord.x < width - 1; coord.x++) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            coord.z++; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.zw, dst); \
+        } \
+    } \
+    else if(rev) \
+    { \
+        for(coord.x = width - 1; coord.x >= 0; coord.x--) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.xy, dst); \
+        } \
+    } \
+    else \
+    { \
+        for(coord.x = 0; coord.x < width; coord.x++) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.xy, dst); \
+        } \
+    } \
 }
+CUMSUM_INT_AXIS0_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_INT_AXIS0_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_INT_AXIS0_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl b/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl
index e535b86..60163a7 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl
@@ -132,3 +132,30 @@ __kernel void one_hot_U8toU8
     coord.z ++;
     } while (coord.z < depth);
 }
+
+__kernel void one_hot_I32toBF16
+    (
+    __read_only  image2d_t       input,
+    __write_only image2d_array_t output,
+                 int             depth,
+                 uint            on_value,
+                 uint            off_value,
+                 float           inputScale,
+                 float           inputTail
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+
+    int4 src = read_imagei(input, coord.xy);
+
+    int val = convert_int(convert_float(src.x) * inputScale - inputTail);
+    do
+    {
+        uint4 dst;
+        dst.x = val == coord.z ?
on_value : off_value; + + write_imageui(output, coord.xzyw, dst.xxxx); + + coord.z ++; + } while (coord.z < depth); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/rope_0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/rope_0.cl new file mode 100644 index 0000000..08ad576 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/rope_0.cl @@ -0,0 +1,373 @@ +__kernel void rope_F32_F32toF32_axis0 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + float4 cos, sin; + + READ_IMAGEF_2DARRAY(cos, cos_cache, coord); + READ_IMAGEF_2DARRAY(sin, sin_cache, coord); + coord.x = coord.x * step; + float4 src0 = read_imagef(input, coord); + int4 coord_out = coord; + + coord.x += half_head_size; + float4 src1 = read_imagef(input, coord); + + float4 dst0 = src0 * cos - src1 * sin; + float4 dst1 = src0 * sin + src1 * cos; + + write_imagef(output, coord_out, dst0); + coord_out.x += half_head_size; + write_imagef(output, coord_out, dst1); +} + +__kernel void rope_F32_F32toF32_axis1 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + float4 cos, sin; + + READ_IMAGEF_2DARRAY(cos, cos_cache, coord); + READ_IMAGEF_2DARRAY(sin, sin_cache, coord); + coord.y = coord.y * step; + float4 src0 = read_imagef(input, coord); + int4 coord_out = coord; + coord.y += half_head_size; + float4 src1 = read_imagef(input, coord); + + float4 dst0 = src0 * cos - src1 * sin; + float4 dst1 = src0 * sin + src1 * cos; + + write_imagef(output, coord_out, dst0); + coord_out.y += half_head_size; + write_imagef(output, coord_out, dst1); +} + +__kernel void rope_F32_F32toF32_axis2 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + + float4 cos = read_imagef(cos_cache, coord); + float4 sin = read_imagef(sin_cache, coord); + coord.z = coord.z * step; + float4 src0 = read_imagef(input, coord); + int4 coord_out = coord; + coord.z += half_head_size; + float4 src1 = read_imagef(input, coord); + + float4 dst0 = src0 * cos - src1 * sin; + float4 dst1 = src0 * sin + src1 * cos; + + write_imagef(output, coord_out, dst0); + coord_out.z += half_head_size; + write_imagef(output, coord_out, dst1); +} + +__kernel void rope_I32_I32toI32_axis0 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord 
= (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 _cos, _sin; + float4 cos, sin; + + READ_IMAGEI_2DARRAY(_cos, cos_cache, coord); + READ_IMAGEI_2DARRAY(_sin, sin_cache, coord); + coord.x = coord.x * step; + float4 src0 = convert_float4(read_imagei(input, coord)); + int4 coord_out = coord; + + coord.x += half_head_size; + float4 src1 = convert_float4(read_imagei(input, coord)); + + src0 = src0 - input_zp; + src1 = src1 - input_zp; + cos = convert_float4(_cos) - cos_zp; + sin = convert_float4(_sin) - sin_zp; + + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp; + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp; + int4 dst0 = convert_int4_rte(_dst0); + int4 dst1 = convert_int4_rte(_dst1); + + write_imagei(output, coord_out, dst0); + coord_out.x += half_head_size; + write_imagei(output, coord_out, dst1); +} + +__kernel void rope_I32_I32toI32_axis1 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 _cos, _sin; + float4 cos, sin; + + READ_IMAGEI_2DARRAY(_cos, cos_cache, coord); + READ_IMAGEI_2DARRAY(_sin, sin_cache, coord); + coord.y = coord.y * step; + float4 src0 = convert_float4(read_imagei(input, coord)); + int4 coord_out = coord; + + coord.y += half_head_size; + float4 src1 = convert_float4(read_imagei(input, coord)); + + src0 = src0 - input_zp; + src1 = src1 - input_zp; + cos = convert_float4(_cos) - cos_zp; + sin = convert_float4(_sin) - sin_zp; + + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp; + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp; + int4 dst0 = convert_int4_rte(_dst0); + int4 dst1 = convert_int4_rte(_dst1); + + write_imagei(output, coord_out, dst0); + coord_out.y += half_head_size; + write_imagei(output, coord_out, dst1); +} + +__kernel void rope_I32_I32toI32_axis2 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + + float4 cos = convert_float4(read_imagei(cos_cache, coord)); + float4 sin = convert_float4(read_imagei(sin_cache, coord)); + coord.z = coord.z * step; + float4 src0 = convert_float4(read_imagei(input, coord)); + int4 coord_out = coord; + + coord.z += half_head_size; + float4 src1 = convert_float4(read_imagei(input, coord)); + + src0 = src0 - input_zp; + src1 = src1 - input_zp; + cos = cos - cos_zp; + sin = sin - sin_zp; + + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp; + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp; + int4 dst0 = convert_int4_rte(_dst0); + int4 dst1 = convert_int4_rte(_dst1); + + write_imagei(output, coord_out, dst0); + coord_out.z += half_head_size; + write_imagei(output, coord_out, dst1); +} + +__kernel void rope_U32_U32toU32_axis0 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + 
__write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + uint4 _cos, _sin; + float4 cos, sin; + + READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord); + READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord); + coord.x = coord.x * step; + float4 src0 = convert_float4(read_imageui(input, coord)); + int4 coord_out = coord; + + coord.x += half_head_size; + float4 src1 = convert_float4(read_imageui(input, coord)); + + src0 = src0 - input_zp; + src1 = src1 - input_zp; + cos = convert_float4(_cos) - cos_zp; + sin = convert_float4(_sin) - sin_zp; + + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp; + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp; + uint4 dst0 = convert_uint4_rte(_dst0); + uint4 dst1 = convert_uint4_rte(_dst1); + + write_imageui(output, coord_out, dst0); + coord_out.x += half_head_size; + write_imageui(output, coord_out, dst1); +} + +__kernel void rope_U32_U32toU32_axis1 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + uint4 _cos, _sin; + float4 cos, sin; + + READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord); + READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord); + coord.y = coord.y * step; + float4 src0 = convert_float4(read_imageui(input, coord)); + int4 coord_out = coord; + + coord.y += half_head_size; + float4 src1 = convert_float4(read_imageui(input, coord)); + + src0 = src0 - input_zp; + src1 = src1 - input_zp; + cos = convert_float4(_cos) - cos_zp; + sin = convert_float4(_sin) - sin_zp; + + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp; + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp; + uint4 dst0 = convert_uint4_rte(_dst0); + uint4 dst1 = convert_uint4_rte(_dst1); + + write_imageui(output, coord_out, dst0); + coord_out.y += half_head_size; + write_imageui(output, coord_out, dst1); +} + +__kernel void rope_U32_U32toU32_axis2 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis, + float input_zp, + float cos_zp, + float sin_zp, + float scale0, + float scale1, + float output_zp, + int half_head_size, + int step + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + + float4 cos = convert_float4(read_imageui(cos_cache, coord)); + float4 sin = convert_float4(read_imageui(sin_cache, coord)); + coord.z = coord.z * step; + float4 src0 = convert_float4(read_imageui(input, coord)); + int4 coord_out = coord; + + coord.z += half_head_size; + float4 src1 = convert_float4(read_imageui(input, coord)); + + src0 = src0 - input_zp; + src1 = src1 - input_zp; + cos = cos - cos_zp; + sin = sin - sin_zp; + + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp; + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp; + uint4 dst0 = convert_uint4_rte(_dst0); + uint4 dst1 = convert_uint4_rte(_dst1); + + write_imageui(output, coord_out, dst0); + coord_out.z += 
half_head_size; + write_imageui(output, coord_out, dst1); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/custom_letterbox.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_letterbox.vx new file mode 100644 index 0000000..8a8d1ad --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/custom_letterbox.vx @@ -0,0 +1,307 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int top; +_viv_uniform int left; +_viv_uniform float out_scale_r; +_viv_uniform float out_scale_g; +_viv_uniform float out_scale_b; +_viv_uniform float out_zp_r; +_viv_uniform float out_zp_g; +_viv_uniform float out_zp_b; +_viv_uniform float pad_v_r; +_viv_uniform float pad_v_g; +_viv_uniform float pad_v_b; +_viv_uniform float scale_w; +_viv_uniform float scale_h; +_viv_uniform int resize_max_w; +_viv_uniform int resize_max_h; +_viv_uniform int out_height; +_viv_uniform int r_order; +_viv_uniform int b_order; +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4; +_viv_uniform VXC_512Bits uniLeftToFloat32_4x4; +_viv_uniform VXC_512Bits uniExtactHalf8_2x8; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +__kernel void custom_letterbox_U8toU8 + ( + __read_only image2d_t input, + __write_only image2d_t output, + int top_, + int bottom_, + int left_, + int right_, + float mean_r_, + float mean_g_, + float mean_b_, + float scale_r_, + float scale_g_, + float scale_b_, + int pad_r_, + int pad_g_, + int pad_b_, + int reverse_channel + ) +{ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + int2 coord = coord_out; + uint4 dst = (uint4)(0,0,0,0); + vxc_uchar8 result; + + if (coord_out.x < left || coord_out.x >= resize_max_w || + coord_out.y < top || coord_out.y >= resize_max_h) + { + dst.x = convert_uint(pad_v_r); + coord.y = coord_out.y + r_order; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + dst.x = convert_uint(pad_v_g); + coord.y = coord_out.y + out_height; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + dst.x = convert_uint(pad_v_b); + coord.y = coord_out.y + b_order; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + return; + } + + float in_x = convert_float(coord_out.x - left) * scale_w; + float in_y = convert_float(coord_out.y - top) * scale_h; + float left_x_f = floor(in_x); + float top_y_f = floor(in_y); + float x_lerp = in_x - left_x_f; + float y_lerp = in_y - top_y_f; + int left_x_idx = convert_int(left_x_f); + int top_y_idx = convert_int(top_y_f); + + int2 coord_in = (int2)(3 * left_x_idx, top_y_idx); + vxc_uchar8 top_data, bottom_data; + + VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + + float4 left4 = (float4)(0,0,0,0); + float4 right4 = (float4)(0,0,0,0); + float4 top4 = (float4)(0,0,0,0); + float4 bottom4 = (float4)(0,0,0,0); + VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4); + top4 = right4 * x_lerp + left4; + VXC_DP4x4(right4, 
bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4); + bottom4 = right4 * x_lerp + left4; + float4 out = (bottom4 - top4) * y_lerp + top4; + + dst.x = convert_uint(out.x * out_scale_r + out_zp_r ); + coord.y = coord_out.y + r_order; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + dst.x = convert_uint(out.y * out_scale_g + out_zp_g); + coord.y = coord_out.y + out_height; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + dst.x = convert_uint(out.z * out_scale_b + out_zp_b); + coord.y = coord_out.y + b_order; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_letterbox_U8toI8 + ( + __read_only image2d_t input, + __write_only image2d_t output, + int top_, + int bottom_, + int left_, + int right_, + float mean_r_, + float mean_g_, + float mean_b_, + float scale_r_, + float scale_g_, + float scale_b_, + int pad_r_, + int pad_g_, + int pad_b_, + int reverse_channel + ) +{ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + int2 coord = coord_out; + int4 dst = (int4)(0,0,0,0); + vxc_char8 result; + + if (coord_out.x < left || coord_out.x >= resize_max_w || + coord_out.y < top || coord_out.y >= resize_max_h) + { + dst.x = convert_int(pad_v_r); + coord.y = coord_out.y + r_order; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + dst.x = convert_int(pad_v_g); + coord.y = coord_out.y + out_height; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + dst.x = convert_int(pad_v_b); + coord.y = coord_out.y + b_order; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + return; + } + + float in_x = convert_float(coord_out.x - left) * scale_w; + float in_y = convert_float(coord_out.y - top) * scale_h; + float left_x_f = floor(in_x); + float top_y_f = floor(in_y); + float x_lerp = in_x - left_x_f; + float y_lerp = in_y - top_y_f; + int left_x_idx = convert_int(left_x_f); + int top_y_idx = convert_int(top_y_f); + + int2 coord_in = (int2)(3 * left_x_idx, top_y_idx); + vxc_char8 top_data, bottom_data; + + VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + + float4 left4 = (float4)(0,0,0,0); + float4 right4 = (float4)(0,0,0,0); + float4 top4 = (float4)(0,0,0,0); + float4 bottom4 = (float4)(0,0,0,0); + VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4); + top4 = right4 * x_lerp 
+ left4; + VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4); + bottom4 = right4 * x_lerp + left4; + float4 out = (bottom4 - top4) * y_lerp + top4; + + dst.x = convert_int(out.x * out_scale_r + out_zp_r); + coord.y = coord_out.y + r_order; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + dst.x = convert_int(out.y * out_scale_g + out_zp_g); + coord.y = coord_out.y + out_height; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + dst.x = convert_int(out.z * out_scale_b + out_zp_b); + coord.y = coord_out.y + b_order; + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_letterbox_U8toF16 + ( + __read_only image2d_t input, + __write_only image2d_t output, + int top_, + int bottom_, + int left_, + int right_, + float mean_r_, + float mean_g_, + float mean_b_, + float scale_r_, + float scale_g_, + float scale_b_, + int pad_r_, + int pad_g_, + int pad_b_, + int reverse_channel + ) +{ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); + int2 coord = coord_out; + half4 tmp; + vxc_half8 dst_temp; + vxc_ushort8 dst; + + if (coord_out.x < left || coord_out.x >= resize_max_w || + coord_out.y < top || coord_out.y >= resize_max_h) + { + + float4 pad = (float4)(pad_v_r, pad_v_g, pad_v_b, 0); + _viv_asm(CONV, tmp, pad); + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst, dst_temp, 16); + coord.y = coord_out.y + r_order; + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + tmp.x = tmp.y; + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst, dst_temp, 16); + coord.y = coord_out.y + out_height; + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + tmp.x = tmp.z; + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst, dst_temp, 16); + coord.y = coord_out.y + b_order; + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + return; + } + + float in_x = convert_float(coord_out.x - left) * scale_w; + float in_y = convert_float(coord_out.y - top) * scale_h; + float left_x_f = floor(in_x); + float top_y_f = floor(in_y); + float x_lerp = in_x - left_x_f; + float y_lerp = in_y - top_y_f; + int left_x_idx = convert_int(left_x_f); + int top_y_idx = convert_int(top_y_f); + + int2 coord_in = (int2)(3 * left_x_idx, top_y_idx); + vxc_uchar8 top_data, bottom_data; + + VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); + + float4 left4 = (float4)(0,0,0,0); + float4 right4 = (float4)(0,0,0,0); + float4 top4 = (float4)(0,0,0,0); + float4 bottom4 = (float4)(0,0,0,0); + VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 
0), uniU8RightSubLeft_4x4); + VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4); + top4 = right4 * x_lerp + left4; + VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4); + bottom4 = right4 * x_lerp + left4; + float4 out = (bottom4 - top4) * y_lerp + top4; + + float4 out_temp = (float4)(0,0,0,0); + out_temp.x = out.x * out_scale_r + out_zp_r; + _viv_asm(CONV, tmp, out_temp); + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst, dst_temp, 16); + coord.y = coord_out.y + r_order; + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + out_temp.x = out.y * out_scale_g + out_zp_g; + _viv_asm(CONV, tmp, out_temp); + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst, dst_temp, 16); + coord.y = coord_out.y + out_height; + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + out_temp.x = out.z * out_scale_b + out_zp_b; + _viv_asm(CONV, tmp, out_temp); + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); + _viv_asm(COPY, dst, dst_temp, 16); + coord.y = coord_out.y + out_height; + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/custom_softmax.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_softmax.vx index e3ca29e..432a228 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/custom_softmax.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/custom_softmax.vx @@ -10,7 +10,12 @@ #include "cl_viv_vx_ext.h" _viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32; +_viv_uniform VXC_512Bits uniExtract8Bin_2x8; _viv_uniform int sf_size; +_viv_uniform float srcScale; +_viv_uniform float srcZP; +_viv_uniform float dstScale; +_viv_uniform float dstZP; #define F_MAX(a,b) ((a)>(b)?(a):(b)) __kernel void Softmax2VXC ( @@ -19,35 +24,37 @@ __kernel void Softmax2VXC int axis ) { - int4 coord_in = (int4)(0,0,0,0); - float fMax = 0.0; + int4 coord_in = (int4)(0, get_global_id(0), 0, 0); + float fMax = 0; for (int i = 0; i < sf_size; i++) { - vxc_char8 val; + vxc_short8 val; + vxc_half8 val_h; coord_in.x = i; - VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, val_h, val, 16); float fval; - VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); + VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); fMax = F_MAX(fMax, fval); } - float fProbSum = 0.0f; vxc_short8 dst; for (int i = 0; i < sf_size; i++) { - vxc_char8 val; - + vxc_short8 val; + vxc_half8 val_h; coord_in.x = i; - VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, val_h, val, 16); float fval; - VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); - + VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); float fOut = (float)exp(fval - fMax); fProbSum += fOut; half hVal; - 
_viv_asm(CONV,hVal,fOut); - _viv_asm(COPY,dst,hVal, 4); + _viv_asm(CONV, hVal, fOut); + _viv_asm(COPY, dst, hVal, 4); + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } @@ -56,15 +63,68 @@ __kernel void Softmax2VXC vxc_short8 val; vxc_half8 val_h; coord_in.x = i; - VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); float fval; _viv_asm(COPY, val_h,val, 16); VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); - - float fOut =fval/fProbSum; + float fOut =fval / fProbSum; half hVal; - _viv_asm(CONV,hVal,fOut); + _viv_asm(CONV, hVal, fOut); _viv_asm(COPY,dst,hVal, 4); VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } } + +__kernel void Softmax2VXC_u8 + ( + image2d_array_t input, + image2d_array_t output, + int axis + ) +{ + int4 coord_in = (int4)(0, get_global_id(0), 0, 0); + float fMax = -3.4e38f; + for (int i = 0; i < sf_size; i++) + { + vxc_uchar8 val; + coord_in.x = i; + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + float fval; + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); + fval = (fval - srcZP) * srcScale; + fMax = F_MAX(fMax, fval); + } + + float fProbSum = 0.0f; + vxc_uchar8 dst; + for (int i = 0; i < sf_size; i++) + { + vxc_uchar8 val; + + coord_in.x = i; + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + float fval; + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); + fval = (fval - srcZP) * srcScale; + float fOut = (float)exp(fval - fMax); + fProbSum += fOut; + } + + for (int i = 0; i < sf_size; i++) + { + vxc_uchar8 val; + coord_in.x = i; + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + float fval; + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); + fval = (fval - srcZP) * srcScale; + + float fOut = exp(fval - fMax) / fProbSum; + + fOut = fOut * dstScale + dstZP; + short dst0; + _viv_asm(CONV, dst0, fOut); + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8); + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx index 33edef8..15f596e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx @@ -16,7 +16,7 @@ _viv_uniform float sum_x2_tail1; _viv_uniform float output_scale; _viv_uniform float output_zp; -#define GROUP_NORM_SUMS_16BITS_IMPL(name, src_type) \ +#define GROUP_NORM_SUMS_16BITS_IMPL(name, load_type, src_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ @@ -26,7 +26,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## int lidx = get_local_id(0); \ int gidz = get_global_id(1); \ int4 coord = (int4)(gidx, 0, gidz, 0); \ - vxc_short8 src0; \ + load_type src; \ src_type in_h; \ float4 sumsqr; \ float4 tmpSumSqr = (float4)(0); \ @@ -43,9 +43,9 @@ 
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## { \ for(coord.y = 0; coord.y < height;) \ { \ - VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ coord.y++; \ - _viv_asm(COPY, in_h, src0, 16); \ + _viv_asm(COPY, in_h, src, 16); \ VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \ tmpSumSqr += sumsqr; \ } \ @@ -76,10 +76,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## write_imagef(output, coord_out, data); \ } \ } -GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_half8) -GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8) +GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_short8, vxc_half8) +GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8, vxc_short8) +GROUP_NORM_SUMS_16BITS_IMPL(U16, vxc_ushort8, vxc_ushort8) -#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \ +#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, load_type, src_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ @@ -89,7 +90,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## int lidx = get_local_id(0); \ \ int2 coord = (int2)(gidx, get_global_id(1)); \ - vxc_short8 src0; \ + load_type src; \ src_type in_h; \ float4 sumsqr = (float4)(0); \ \ @@ -98,8 +99,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## \ if(gidx < width) \ { \ - VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - _viv_asm(COPY, in_h, src0, 16); \ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, in_h, src, 16); \ VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \ sumsqr.y = sumsqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sumsqr.x; \ sumsqr.x = sumsqr.x * input_scale + sum_x_tail; \ @@ -128,8 +129,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## write_imagef(output, coord_out, data); \ } \ } -GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8) -GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8) +GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_short8, vxc_half8) +GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8, vxc_short8) +GROUP_NORM_SUMS_16BITS_IMPL_2D(U16, vxc_ushort8, vxc_ushort8) #define GROUP_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \ @@ -178,7 +180,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( _viv_asm(CONV_RTE, tmpVal0, norm); \ norm = alpha * tmpData1 + bias_val; \ _viv_asm(CONV_RTE, tmpVal1, norm); \ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ } @@ -230,10 +232,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ float4 norm; \ norm = alpha * tmpData0 + bias_val; \ - _viv_asm(CONV, tmpVal0, norm); \ + _viv_asm(CONV_RTE, tmpVal0, 
norm); \ norm = alpha * tmpData1 + bias_val; \ - _viv_asm(CONV, tmpVal1, norm); \ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } @@ -283,7 +285,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \ float4 norm; \ norm = alpha * tmpData0 + bias_val; \ - _viv_asm(CONV, tmpVal0, norm); \ + _viv_asm(CONV_RTE, tmpVal0, norm); \ norm = alpha * tmpData1 + bias_val; \ _viv_asm(CONV_RTE, tmpVal1, norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ @@ -296,6 +298,7 @@ GROUP_NORM_16BITS_F32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int GROUP_NORM_16BITS_F32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) GROUP_NORM_16BITS_F32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4) GROUP_NORM_16BITS_F32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) +GROUP_NORM_16BITS_F32_IMPL(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4) #define GROUP_NORM_16BITS_F32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \ @@ -333,10 +336,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ float4 norm; \ norm = alpha * tmpData0 + bias_val; \ - _viv_asm(CONV, tmpVal0, norm); \ + _viv_asm(CONV_RTE, tmpVal0, norm); \ norm = alpha * tmpData1 + bias_val; \ - _viv_asm(CONV, tmpVal1, norm); \ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } @@ -346,4 +349,5 @@ GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4) GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) +GROUP_NORM_16BITS_F32_IMPL_2D(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/prelu.vx b/src/tim/vx/internal/src/libnnext/ops/vx/prelu.vx index 695601d..89d5b05 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/prelu.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/prelu.vx @@ -115,45 +115,45 @@ _viv_uniform VXC_512Bits uniDataSubZPtoFp32Part1_4x4; _viv_uniform VXC_512Bits uniConvF16toF32_part0_4x4; _viv_uniform VXC_512Bits uniConvF16toF32_part1_4x4; _viv_uniform VXC_512Bits uniExtact8Bin_2x8; -_viv_uniform int inputZP0; -_viv_uniform int inputZP1; -_viv_uniform float input_scale0; -_viv_uniform float input_scale1; -_viv_uniform float outputZP; -#define PRELU_F16_3D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \ - __kernel void prelu_##name0##to##name1( \ +_viv_uniform int input0_zp; +_viv_uniform int input1_zp; +_viv_uniform float 
input0_scale; +_viv_uniform float input1_scale; +_viv_uniform float output_zp; +#define PRELU_F16_3D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \ + __kernel void prelu_##name( \ __read_only image2d_array_t input0, \ __read_only image2d_array_t input1, \ __write_only image2d_array_t output) \ {\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\ - vxc_float4 vecA, vecB, vecC, vecD;\ + float4 vecA, vecB, vecC, vecD;\ input_type0 srcA;\ copy_type0 src0;\ vxc_short8 srcB;\ vxc_half8 src1;\ - input_type0 input_ZP;\ + input_type0 zp;\ VXC_ReadImage2DArray(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ _viv_asm(COPY, src0, srcA, 16); \ VXC_ReadImage2DArray(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ _viv_asm(COPY, src1, srcB, 16); \ \ - _viv_asm(COPY, input_ZP, inputZP0, 4);\ - VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \ + _viv_asm(COPY, zp, input0_zp, 4);\ + VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \ uniDataSubZPtoFp32Part0_4x4); \ - VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \ + VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \ uniDataSubZPtoFp32Part1_4x4);\ VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\ VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\ \ - vecA = vecA * input_scale0;\ - vecB = vecB * input_scale0;\ - vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \ - vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \ - vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \ - vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \ - vecA = maxData0 + vecC * minData0 + outputZP;\ - vecB = maxData1 + vecD * minData1 + outputZP;\ + vecA = vecA * input0_scale;\ + vecB = vecB * input0_scale;\ + float4 maxData0 = vecA > 0 ? vecA : 0.0; \ + float4 maxData1 = vecB > 0 ? vecB : 0.0; \ + float4 minData0 = vecA < 0 ? vecA : 0.0; \ + float4 minData1 = vecB < 0 ? 
vecB : 0.0; \ + vecA = maxData0 + vecC * minData0 + output_zp;\ + vecB = maxData1 + vecD * minData1 + output_zp;\ convert_type dst0, dst1;\ _viv_asm(CONV_RTE, dst0, vecA);\ _viv_asm(CONV_RTE, dst1, vecB);\ @@ -164,49 +164,49 @@ _viv_uniform float outputZP; VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ } // name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type -PRELU_F16_3D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16) -PRELU_F16_3D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8) -PRELU_F16_3D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8) -PRELU_F16_3D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8) -PRELU_F16_3D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16) -PRELU_F16_3D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8) -PRELU_F16_3D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8) -PRELU_F16_3D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16) -PRELU_F16_3D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8) -PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16) +PRELU_F16_3D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16) +PRELU_F16_3D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8) +PRELU_F16_3D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8) +PRELU_F16_3D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8) +PRELU_F16_3D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16) +PRELU_F16_3D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8) +PRELU_F16_3D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8) +PRELU_F16_3D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16) +PRELU_F16_3D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8) +PRELU_F16_3D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16) -#define PRELU_F16_2D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \ - __kernel void prelu_##name0##to##name1##_2D( \ +#define PRELU_F16_2D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \ + __kernel void prelu_##name##_2D( \ __read_only image2d_array_t input0, \ __read_only image2d_array_t input1, \ __write_only image2d_array_t output) \ {\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\ - vxc_float4 vecA, vecB, vecC, vecD;\ + float4 vecA, vecB, vecC, vecD;\ input_type0 srcA;\ copy_type0 src0;\ vxc_short8 srcB;\ vxc_half8 src1;\ - input_type0 input_ZP;\ + input_type0 zp;\ VXC_ReadImage(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ _viv_asm(COPY, src0, srcA, 16); \ VXC_ReadImage(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ _viv_asm(COPY, src1, srcB, 16); \ \ - _viv_asm(COPY, input_ZP, inputZP0, 4);\ - VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\ - VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\ + _viv_asm(COPY, zp, input0_zp, 4);\ + VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\ + VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\ VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\ VXC_DP4x4(vecD, src1, src1, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\ \ - vecA = vecA * input_scale0;\ - vecB = vecB * input_scale0;\ - vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \ - vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \ - vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \ - vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \ - vecA = maxData0 + vecC * minData0 + outputZP;\ - vecB = maxData1 + vecD * minData1 + outputZP;\ + vecA = vecA * input0_scale;\ + vecB = vecB * input0_scale;\ + float4 maxData0 = vecA > 0 ? vecA : 0.0; \ + float4 maxData1 = vecB > 0 ? vecB : 0.0; \ + float4 minData0 = vecA < 0 ? vecA : 0.0; \ + float4 minData1 = vecB < 0 ? vecB : 0.0; \ + vecA = maxData0 + vecC * minData0 + output_zp;\ + vecB = maxData1 + vecD * minData1 + output_zp;\ convert_type dst0, dst1;\ _viv_asm(CONV_RTE, dst0, vecA);\ _viv_asm(CONV_RTE, dst1, vecB);\ @@ -216,49 +216,49 @@ PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha _viv_asm(COPY, dst, dst2, 16); \ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ } -PRELU_F16_2D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8) -PRELU_F16_2D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16) -PRELU_F16_2D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8) -PRELU_F16_2D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16) -PRELU_F16_2D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8) -PRELU_F16_2D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8) -PRELU_F16_2D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16) -PRELU_F16_2D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8) -PRELU_F16_2D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8) -PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16) +PRELU_F16_2D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8) +PRELU_F16_2D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16) +PRELU_F16_2D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8) +PRELU_F16_2D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16) +PRELU_F16_2D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8) +PRELU_F16_2D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8) +PRELU_F16_2D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16) +PRELU_F16_2D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8) +PRELU_F16_2D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8) +PRELU_F16_2D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16) -#define PRELU_U8_2D(name, output_type, convert_type, copy_type) \ - __kernel void prelu_U8U8to##name##_2D( \ +#define PRELU_INTEGER_2D(name, src0_type, src1_type, output_type, convert_type, copy_type) \ + __kernel void prelu_##name##_2D( \ __read_only image2d_array_t input0, \ __read_only image2d_array_t input1, \ __write_only image2d_array_t output) \ {\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\ - vxc_float4 vecA, vecB, vecC, vecD;\ - vxc_uchar16 src0;\ - vxc_uchar16 src1;\ - vxc_uchar16 input_ZP0;\ - vxc_uchar16 input_ZP1;\ + float4 vecA, vecB, vecC, vecD;\ + src0_type src0;\ + src1_type src1;\ + short zp0;\ + short zp1;\ VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ \ - _viv_asm(COPY, input_ZP0, inputZP0, 
4);\ - VXC_DP4x4(vecA, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\ - VXC_DP4x4(vecB, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\ - _viv_asm(COPY, input_ZP1, inputZP1, 4);\ - VXC_DP4x4(vecC, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\ - VXC_DP4x4(vecD, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\ + _viv_asm(COPY, zp0, input0_zp, 2);\ + VXC_DP4x4(vecA, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\ + VXC_DP4x4(vecB, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\ + _viv_asm(COPY, zp1, input1_zp, 4);\ + VXC_DP4x4(vecC, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\ + VXC_DP4x4(vecD, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\ \ - vecA = vecA * input_scale0;\ - vecB = vecB * input_scale0;\ - vecC = vecC * input_scale1;\ - vecD = vecD * input_scale1;\ - vxc_float4 maxData0 = vecA >= 0 ? vecA : 0.0; \ - vxc_float4 maxData1 = vecB >= 0 ? vecB : 0.0; \ - vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \ - vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \ - vecA = maxData0 + vecC * minData0 + outputZP;\ - vecB = maxData1 + vecD * minData1 + outputZP;\ + vecA = vecA * input0_scale;\ + vecB = vecB * input0_scale;\ + vecC = vecC * input1_scale;\ + vecD = vecD * input1_scale;\ + float4 maxData0 = vecA >= 0 ? vecA : 0.0; \ + float4 maxData1 = vecB >= 0 ? vecB : 0.0; \ + float4 minData0 = vecA < 0 ? vecA : 0.0; \ + float4 minData1 = vecB < 0 ? vecB : 0.0; \ + vecA = maxData0 + vecC * minData0 + output_zp;\ + vecB = maxData1 + vecD * minData1 + output_zp;\ convert_type dst0, dst1;\ _viv_asm(CONV_RTE, dst0, vecA);\ _viv_asm(CONV_RTE, dst1, vecB);\ @@ -268,7 +268,8 @@ PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha _viv_asm(COPY, dst, dst2, 16); \ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\ } -PRELU_U8_2D(U8, vxc_uchar16, int4, vxc_uchar16) -PRELU_U8_2D(F16, vxc_half8, half4, vxc_short8) +PRELU_INTEGER_2D(U8U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16) +PRELU_INTEGER_2D(U8U8toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_3.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_3.vx new file mode 100644 index 0000000..6fa4e3e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_3.vx @@ -0,0 +1,181 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8PostProcess_2x8; +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniResize2xUp_0_4x8; +_viv_uniform VXC_512Bits uniResize2xUp_1_4x8; +_viv_uniform int out_height; + +__kernel void resize_bilinear_U8toU8_2x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0); + coord_in.x = (coord_out.x * 2 - 1) >> 2; + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x; + + vxc_uchar16 in0, in1, tmp, result; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + vxc_ushort8 multiplier; + _viv_asm(COPY, multiplier, multAndoutZP, 16); + + vxc_ushort8 dst0; + while (coord_out.y < out_height) + { + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.y += 2; + coord_out.y++; + } +} + +_viv_uniform VXC_512Bits uniResize4xUp_l00_4x8; +_viv_uniform VXC_512Bits uniResize4xUp_l01_4x8; +_viv_uniform VXC_512Bits uniResize4xUp_l10_4x8; +_viv_uniform VXC_512Bits uniResize4xUp_l11_4x8; +__kernel void resize_bilinear_U8toU8_4x_upsample_half_pixel_centers + ( + __read_only 
image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0); + coord_in.x = (coord_out.x * 2 - 3) >> 3; + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x; + + vxc_uchar16 in0, in1, dst0, dst1; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + vxc_ushort8 multiplier; + _viv_asm(COPY, multiplier, multAndoutZP, 16); + + vxc_ushort8 tmp; + while (coord_out.y < out_height) + { + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); + VXC_DP2x8(dst0, tmp, multiplier, 
VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.y += 2; + coord_out.y++; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_4.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_4.vx new file mode 100644 index 0000000..32c188f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_4.vx @@ -0,0 +1,102 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8PostProcess_2x8; +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniResize3xUp_l00_2x8; +_viv_uniform VXC_512Bits uniResize3xUp_l01_2x8; +_viv_uniform VXC_512Bits uniResize3xUp_l10_4x4; +_viv_uniform VXC_512Bits uniResize3xUp_l11_4x4; +_viv_uniform VXC_512Bits uniResize3xUp_l12_4x4; +_viv_uniform VXC_512Bits uniResize3xUp_l13_4x4; +__kernel void resize_bilinear_U8toU8_3x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + coord_in.x = (short)(coord_out.x * 2 - 1) / (short)6; + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x; + coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6; + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y; + + vxc_uchar16 in0, in1, in2, in3, tmp, dst0, dst1, dst2; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + vxc_ushort8 multiplier; + _viv_asm(COPY, multiplier, multAndoutZP, 16); + + vxc_ushort8 data; + + VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4); + VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); + VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); + VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); + VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + + VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8); + VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8); + VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4); + VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); + VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); + VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); + VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4); + VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); + VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); + VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); + VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); + 
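    /* Reference sketch only (not emitted by this kernel): every U8 half-pixel-centers
     * upsample variant in these files (2x/3x/4x/8x) is a specialization of ordinary
     * bilinear resize, with the per-phase fractional weights baked into the
     * uniResize*_* dot-product tables and the requantization folded into multAndoutZP.
     * A 1-D scalar version of that mapping, using illustrative names in/out/in_w/out_w:
     *
     *     float scale = (float)in_w / (float)out_w;
     *     for (int x = 0; x < out_w; ++x)
     *     {
     *         float sx = (x + 0.5f) * scale - 0.5f;          // half_pixel_centers
     *         int   x0 = (int)floorf(sx);
     *         float fx = sx - (float)x0;
     *         int   xa = x0 < 0 ? 0 : x0;
     *         int   xb = (x0 + 1 > in_w - 1) ? in_w - 1 : x0 + 1;
     *         out[x] = (unsigned char)(0.5f + (1.0f - fx) * in[xa] + fx * in[xb]);
     *     }
     */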
coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + + VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8); + VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8); + VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4); + VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); + VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); + VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); + VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_5.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_5.vx new file mode 100644 index 0000000..06ddcae --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers_5.vx @@ -0,0 +1,167 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8PostProcess_2x8; +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp +_viv_uniform int out_height; +_viv_uniform VXC_512Bits uniResize8xUp_l00_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l01_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l10_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l11_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l20_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l21_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l30_4x8; +_viv_uniform VXC_512Bits uniResize8xUp_l31_4x8; +__kernel void resize_bilinear_U8toU8_8x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0); + coord_in.x = (coord_out.x * 2 - 7) >> 4; + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x; + + vxc_uchar16 in0, in1, in2, dst0, dst1, dst2, dst3; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + vxc_ushort8 multiplier; + _viv_asm(COPY, multiplier, multAndoutZP, 16); + + vxc_ushort8 tmp; + while (coord_out.y < out_height) + { + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8); + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8); + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8); + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8); + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, 
VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8); + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8); + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8); + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8); + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8); + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8); + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8); + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8); + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.y += 2; + VXC_OP4_NoDest(img_store_3d, output, 
coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8); + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8); + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8); + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8); + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8); + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8); + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/rope_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/rope_0.vx new file mode 100644 index 0000000..d321d79 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/rope_0.vx @@ -0,0 +1,303 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float scale0; +_viv_uniform float scale1; +_viv_uniform float output_zp; +_viv_uniform int half_head_size; +_viv_uniform VXC_512Bits uniATimesB_0_4x4; +_viv_uniform VXC_512Bits uniATimesB_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +#define ROPE_BNHS_SYMM(name, src_type, src1_type, copy_type, dst_type) \ +__kernel void rope_##name##_bnhs \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t cos_cache, \ + __read_only image2d_array_t sin_cache, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 
get_global_id(2)); \ + int4 coord_out = coord_in; \ + \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + \ + src_type data0, data1; \ + src1_type cos, sin; \ + copy_type v0, v1; \ + dst_type dst; \ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, cos, v0, 16); \ + VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, sin, v1, 16); \ + coord_in.y += half_head_size; \ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + float4 data2, data3, data4, data5; \ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \ + VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \ + VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \ + data3 = data3 * scale0 - data5 * scale1 + output_zp; \ + \ + int4 dst0 = convert_int4_rte(data2); \ + int4 dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \ + VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \ + data2 = data2 * scale1 + data4 * scale0 + output_zp; \ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \ + \ + dst0 = convert_int4_rte(data2); \ + dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + coord_out.y += half_head_size; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +ROPE_BNHS_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8) +ROPE_BNHS_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8) +ROPE_BNHS_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8) +ROPE_BNHS_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +ROPE_BNHS_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8) +ROPE_BNHS_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8) + +__kernel void rope_F16_F16toF16_bnhs + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_out = coord_in; + + int8 input_desc; + 
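    /* The _bnhs kernels in this file apply "rotate-half" RoPE: element i of a head is
     * paired with element i + half_head_size. Per pair, in plain float form:
     *
     *     out[i]                  = in[i] * cos[i] - in[i + half_head_size] * sin[i];
     *     out[i + half_head_size] = in[i] * sin[i] + in[i + half_head_size] * cos[i];
     *
     * The integer variants above evaluate the same rotation with the tensor scales
     * folded into the scale0/scale1 uniforms and the result offset by output_zp before
     * narrowing; how the host derives those uniforms is set up outside this file and is
     * not restated here.
     */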
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + vxc_short8 v0, v1, v2, v3, dst; + vxc_half8 data0, data1, cos, sin, dst2; + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, v0, 16); + VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, cos, v1, 16); + VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, sin, v2, 16); + coord_in.y += half_head_size; + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, v3, 16); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 data2, data3, data4, data5; + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); + VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); + VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); + data2 = data2 - data4; + data3 = data3 - data5; + + half4 dst0; + half4 dst1; + _viv_asm(CONV_RTE, dst0, data2); + _viv_asm(CONV_RTE, dst1, data3); + + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst2, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + + VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); + VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); + data2 = data2 * scale1 + data4 * scale0 + output_zp; + data3 = data3 * scale1 + data5 * scale0 + output_zp; + + _viv_asm(CONV_RTE, dst0, data2); + _viv_asm(CONV_RTE, dst1, data3); + + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst2, 16); + coord_out.y += half_head_size; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform int in0_zp; +_viv_uniform int cos_zp; +_viv_uniform int sin_zp; +_viv_uniform VXC_512Bits uniAMinusZp_0_4x4; +_viv_uniform VXC_512Bits uniAMinusZp_1_4x4; +#define ROPE_ASYM_BNHS(name, src1_type, copy_type, dtype) \ +__kernel void rope_##name##_bnhs \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t cos_cache, \ + __read_only image2d_array_t sin_cache, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + int4 coord_out = coord_in; \ + \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + \ + dtype data0, data1, dst; \ + src1_type cos, sin; \ + copy_type v0, v1; \ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 
0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, cos, v0, 16); \ + VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, sin, v1, 16); \ + coord_in.y += half_head_size; \ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + float4 l00, l01, cos0, cos1; \ + float4 l10, l11, sin0, sin1; \ + VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \ + float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \ + \ + int4 dst0 = convert_int4_rte(data2); \ + int4 dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, \ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \ + data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \ + \ + dst0 = convert_int4_rte(data2); \ + dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + coord_out.y += half_head_size; \ + VXC_OP4_NoDest(img_store_3d, output, \ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +ROPE_ASYM_BNHS(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8) +ROPE_ASYM_BNHS(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +ROPE_ASYM_BNHS(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8) +ROPE_ASYM_BNHS(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8) +ROPE_ASYM_BNHS(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8) +ROPE_ASYM_BNHS(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8) + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; +__kernel void rope_BF16_BF16toBF16_bnhs + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_out = coord_in; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + vxc_ushort8 v0, v1, v2, v3, dst; + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0, + 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y += half_head_size; + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_short8 data; + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src0, data, 16); + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src1, data, 16); + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, cos0, data, 16); + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, cos1, data, 16); + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, sin0, data, 16); + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, sin1, data, 16); + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src2, data, 16); + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src3, data, 16); + + float4 data0 = src0 * cos0 - src2 * sin0; + float4 data1 = src1 * cos1 - src3 * sin1; + + _viv_asm(COPY, v0, data0, 16); + _viv_asm(COPY, v1, data1, 16); + + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + + data0 = src0 * sin0 + src2 * cos0; + data1 = src1 * sin1 + src3 * cos1; + + _viv_asm(COPY, v0, data0, 16); + _viv_asm(COPY, v1, data1, 16); + + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + coord_out.y += half_head_size; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/rope_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/rope_1.vx new file mode 100644 index 0000000..d2aab97 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/rope_1.vx @@ -0,0 +1,245 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float scale0; +_viv_uniform float scale1; +_viv_uniform float output_zp; +_viv_uniform int half_head_size; +_viv_uniform VXC_512Bits uniATimesB_0_4x4; +_viv_uniform VXC_512Bits uniATimesB_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +#define ROPE_BNH1_SYMM(name, src_type, src1_type, copy_type, dst_type) \ +__kernel void rope_##name##_bnh1 \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t cos_cache, \ + __read_only image2d_array_t sin_cache, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + src_type data0, data1; \ + src1_type cos, sin; \ + copy_type v0, v1; \ + VXC_ReadImage(data0, 
input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, cos, v0, 16); \ + VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, sin, v1, 16); \ + coord.x += half_head_size; \ + VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 data2, data3, data4, data5; \ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \ + VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \ + VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \ + data3 = data3 * scale0 - data5 * scale1 + output_zp; \ + \ + int4 dst0 = convert_int4_rte(data2); \ + int4 dst1 = convert_int4_rte(data3); \ + \ + dst_type dst; \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \ + VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \ + data2 = data2 * scale1 + data4 * scale0 + output_zp; \ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \ + \ + dst0 = convert_int4_rte(data2); \ + dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +ROPE_BNH1_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8) +ROPE_BNH1_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8) +ROPE_BNH1_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8) +ROPE_BNH1_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +ROPE_BNH1_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8) +ROPE_BNH1_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8) + +__kernel void rope_F16_F16toF16_bnh1 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + vxc_short8 v0, v1, v2, v3, dst; + vxc_half8 data0, data1, cos, sin, dst2; + VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, v0, 16); + VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, cos, v1, 16); + VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, sin, v2, 16); + coord.x += half_head_size; + VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, v3, 16); + + float4 data2, data3, data4, data5; + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); + 
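    /* For the asymmetrically quantized ROPE_ASYM_* variants defined further down in
     * this file, the zero point of each operand is removed before the rotation. A
     * scalar sketch, treating in0_zp/cos_zp/sin_zp and scale0/scale1/output_zp as
     * host-provided parameters (cosq/sinq are illustrative names for the quantized
     * caches):
     *
     *     float a0 = (float)(in[i]        - in0_zp);
     *     float a1 = (float)(in[i + half] - in0_zp);     // half == half_head_size
     *     float c  = (float)(cosq[i] - cos_zp);
     *     float s  = (float)(sinq[i] - sin_zp);
     *     float r0 = a0 * c * scale0 - a1 * s * scale1 + output_zp;
     *     float r1 = a0 * s * scale1 + a1 * c * scale0 + output_zp;
     *     // r0 -> out[i], r1 -> out[i + half], rounded to nearest and narrowed
     */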
VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); + VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); + data2 = data2 - data4; + data3 = data3 - data5; + + half4 dst0; + half4 dst1; + _viv_asm(CONV_RTE, dst0, data2); + _viv_asm(CONV_RTE, dst1, data3); + + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst2, 16); + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); + VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); + data2 = data2 + data4; + data3 = data3 + data5; + + _viv_asm(CONV_RTE, dst0, data2); + _viv_asm(CONV_RTE, dst1, data3); + + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst2, 16); + VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +_viv_uniform int in0_zp; +_viv_uniform int cos_zp; +_viv_uniform int sin_zp; +_viv_uniform VXC_512Bits uniAMinusZp_0_4x4; +_viv_uniform VXC_512Bits uniAMinusZp_1_4x4; +#define ROPE_ASYM_BNH1(name, src1_type, copy_type, dtype) \ +__kernel void rope_##name##_bnh1 \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t cos_cache, \ + __read_only image2d_array_t sin_cache, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + dtype data0, data1, dst; \ + src1_type cos, sin; \ + copy_type v0, v1; \ + VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, cos, v0, 16); \ + VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, sin, v1, 16); \ + coord.x += half_head_size; \ + VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 l00, l01, cos0, cos1; \ + float4 l10, l11, sin0, sin1; \ + VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \ + float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \ + \ + int4 dst0 = convert_int4_rte(data2); \ + int4 dst1 = 
convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \ + data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \ + \ + dst0 = convert_int4_rte(data2); \ + dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +ROPE_ASYM_BNH1(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8) +ROPE_ASYM_BNH1(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +ROPE_ASYM_BNH1(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8) +ROPE_ASYM_BNH1(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8) +ROPE_ASYM_BNH1(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8) +ROPE_ASYM_BNH1(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8) + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; +__kernel void rope_BF16_BF16toBF16_bnh1 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + vxc_ushort8 v0, v1, v2, v3, dst; + VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += half_head_size; + VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_short8 data; + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src0, data, 16); + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src1, data, 16); + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, cos0, data, 16); + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, cos1, data, 16); + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, sin0, data, 16); + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, sin1, data, 16); + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src2, data, 16); + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src3, data, 16); + + float4 data0 = src0 * cos0 - src2 * sin0; + float4 data1 = src1 * cos1 - src3 * sin1; + + _viv_asm(COPY, v0, data0, 16); + _viv_asm(COPY, v1, data1, 16); + + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + data0 = src0 * sin0 + src2 * cos0; + data1 = src1 * sin1 + src3 * cos1; + + 
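    /* BF16 path: each bfloat16 lane is widened to float by moving its 16 bits into the
     * upper half of a 32-bit word (uniConvBF16toF32_Part0/1 against a zero vector), the
     * rotation runs in float, and uniExtractOddData_2x8 keeps only the upper 16 bits of
     * each float result, i.e. a truncating float -> bfloat16 narrowing. Scalar sketch of
     * that narrowing (assuming the same truncation, no rounding):
     *
     *     unsigned int   bits = *(unsigned int *)&f;     // f is the float result
     *     unsigned short bf16 = (unsigned short)(bits >> 16);
     */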
_viv_asm(COPY, v0, data0, 16); + _viv_asm(COPY, v1, data1, 16); + + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/rope_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/rope_2.vx new file mode 100644 index 0000000..a77830b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/rope_2.vx @@ -0,0 +1,312 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float scale0; +_viv_uniform float scale1; +_viv_uniform float output_zp; +_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4; +_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4; +_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4; +_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +#define ROPE_BSNH_SYMM(name, src_type, src1_type, copy_type, dst_type) \ +__kernel void rope_##name##_bsnh \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t cos_cache, \ + __read_only image2d_array_t sin_cache, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + \ + src_type data0, data1; \ + src1_type cos, sin; \ + copy_type v0, v1; \ + dst_type dst; \ + VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, cos, v0, 16); \ + VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, sin, v1, 16); \ + \ + coord_in.x *= 2; \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 coord_out = coord_in; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + float4 data2, data3, data4, data5; \ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \ + VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \ + VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \ + \ + int4 dst0 = convert_int4_rte(data2); \ + int4 dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \ + VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \ + 
data2 = data2 * scale0 - data4 * scale1 + output_zp; \ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \ + \ + dst0 = convert_int4_rte(data2); \ + dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + coord_out.x += 8; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +ROPE_BSNH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8) +ROPE_BSNH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8) +ROPE_BSNH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8) +ROPE_BSNH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +ROPE_BSNH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8) +ROPE_BSNH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8) + +__kernel void rope_F16_F16toF16_bsnh + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + + vxc_short8 v0, v1, v2, v3, dst; + vxc_half8 data0, data1, cos, sin, dst2; + VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, cos, v1, 16); + VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, sin, v2, 16); + + coord_in.x *= 2; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, v0, 16); + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, v3, 16); + + int4 coord_out = coord_in; + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 data2, data3, data4, data5; + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); + VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); + VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); + data2 = data2 - data4; + data3 = data3 + data5; + + half4 dst0; + half4 dst1; + _viv_asm(CONV_RTE, dst0, data2); + _viv_asm(CONV_RTE, dst1, data3); + + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst2, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + + VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); + VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); + data2 = data2 - data4; + data3 = data3 + data5; + + _viv_asm(CONV_RTE, dst0, data2); + 
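    /* The _bsnh kernels here (and the _bnsh kernels in rope_3.vx) use the interleaved
     * RoPE layout: cos/sin entry k rotates the adjacent pair (in[2k], in[2k+1]) rather
     * than elements half a head apart, which is why coord_in.x is doubled before the
     * loads. Per pair, in plain float form:
     *
     *     out[2k]     = in[2k] * cos[k] - in[2k + 1] * sin[k];
     *     out[2k + 1] = in[2k] * sin[k] + in[2k + 1] * cos[k];
     *
     * The uniAEvenTimesB_* / uniAOddTimesB_* tables pick the even and odd lanes of each
     * 8-element block so the rotation is evaluated with dot products over the packed
     * vector.
     */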
_viv_asm(CONV_RTE, dst1, data3); + + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst2, 16); + coord_out.x += 8; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform int in0_zp; +_viv_uniform int cos_zp; +_viv_uniform int sin_zp; +_viv_uniform VXC_512Bits uniAMinusZp_0_4x4; +_viv_uniform VXC_512Bits uniAMinusZp_1_4x4; +_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4; +_viv_uniform VXC_512Bits uniAOddMinusZp_4x4; +#define ROPE_ASYM_BSNH(name, src1_type, copy_type, dtype) \ +__kernel void rope_##name##_bsnh \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t cos_cache, \ + __read_only image2d_array_t sin_cache, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + \ + dtype data0, data1, dst; \ + src1_type cos, sin; \ + copy_type v0, v1; \ + \ + VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, cos, v0, 16); \ + VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, sin, v1, 16); \ + coord_in.x *= 2; \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 coord_out = coord_in; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + float4 l00, l01, cos0, cos1; \ + float4 l10, l11, sin0, sin1; \ + VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \ + VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \ + VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \ + float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \ + \ + int4 dst0 = convert_int4_rte(data2); \ + int4 dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, \ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \ + VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \ + data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \ + data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \ + \ + dst0 = convert_int4_rte(data2); \ + dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + coord_out.x += 8; \ + VXC_OP4_NoDest(img_store_3d, output, \ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +ROPE_ASYM_BSNH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8) +ROPE_ASYM_BSNH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +ROPE_ASYM_BSNH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8) +ROPE_ASYM_BSNH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8) +ROPE_ASYM_BSNH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8) +ROPE_ASYM_BSNH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8) + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; +__kernel void rope_BF16_BF16toBF16_bsnh + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + + vxc_ushort8 v0, v1, v2, v3, dst; + VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + coord_in.x *= 2; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + int4 coord_out = coord_in; + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_short8 data; + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src0, data, 16); + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src1, data, 16); + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, cos0, data, 16); + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, cos1, data, 16); + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, sin0, data, 16); + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, sin1, data, 16); + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src2, data, 16); + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src3, data, 16); + + float4 even = (float4)(src0.xz, src1.xz); + float4 odd = (float4)(src0.yw, src1.yw); + float4 data0 = even * cos0 - odd * sin0; + float4 data1 = even * sin0 + odd * cos0; + + _viv_asm(COPY, v0, data0, 16); + _viv_asm(COPY, v1, data1, 16); + + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 
0,VXC_RM_TowardZero, 0)); + + even = (float4)(src2.xz, src3.xz); + odd = (float4)(src2.yw, src3.yw); + data0 = even * cos1 - odd * sin1; + data1 = even * sin1 + odd * cos1; + + _viv_asm(COPY, v0, data0, 16); + _viv_asm(COPY, v1, data1, 16); + + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + coord_out.x += 8; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/rope_3.vx b/src/tim/vx/internal/src/libnnext/ops/vx/rope_3.vx new file mode 100644 index 0000000..3fb11f9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/rope_3.vx @@ -0,0 +1,312 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float scale0; +_viv_uniform float scale1; +_viv_uniform float output_zp; +_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4; +_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4; +_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4; +_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +#define ROPE_BNSH_SYMM(name, src_type, src1_type, copy_type, dst_type) \ +__kernel void rope_##name##_bnsh \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t cos_cache, \ + __read_only image2d_array_t sin_cache, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + \ + src_type data0, data1; \ + src1_type cos, sin; \ + copy_type v0, v1; \ + dst_type dst; \ + VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, cos, v0, 16); \ + VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, sin, v1, 16); \ + \ + coord_in.x *= 2; \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 coord_out = coord_in; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + float4 data2, data3, data4, data5; \ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \ + VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \ + VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \ + \ + int4 dst0 = convert_int4_rte(data2); \ + int4 dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \ + VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniAEvenTimesB_1_4x4); \ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \ + \ + dst0 = convert_int4_rte(data2); \ + dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + coord_out.x += 8; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +ROPE_BNSH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8) +ROPE_BNSH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8) +ROPE_BNSH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8) +ROPE_BNSH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +ROPE_BNSH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8) +ROPE_BNSH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8) + +__kernel void rope_F16_F16toF16_bnsh + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + + vxc_short8 v0, v1, v2, v3, dst; + vxc_half8 data0, data1, cos, sin, dst2; + VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, cos, v1, 16); + VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, sin, v2, 16); + + coord_in.x *= 2; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data0, v0, 16); + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data1, v3, 16); + + int4 coord_out = coord_in; + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 data2, data3, data4, data5; + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); + VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); + VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); + data2 = data2 - data4; + data3 = data3 + data5; + + half4 dst0; + half4 dst1; + _viv_asm(CONV_RTE, dst0, data2); + _viv_asm(CONV_RTE, dst1, data3); + + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst2, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + + VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); + VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); + data2 = data2 - data4; + data3 = data3 + data5; + + _viv_asm(CONV_RTE, dst0, data2); + _viv_asm(CONV_RTE, dst1, data3); + + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + _viv_asm(COPY, dst, dst2, 16); + coord_out.x += 8; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform int in0_zp; +_viv_uniform int cos_zp; +_viv_uniform int sin_zp; +_viv_uniform VXC_512Bits uniAMinusZp_0_4x4; +_viv_uniform VXC_512Bits uniAMinusZp_1_4x4; +_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4; +_viv_uniform VXC_512Bits uniAOddMinusZp_4x4; +#define ROPE_ASYM_BNSH(name, src1_type, copy_type, dtype) \ +__kernel void rope_##name##_bnsh \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t cos_cache, \ + __read_only image2d_array_t sin_cache, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + \ + dtype data0, data1, dst; \ + src1_type cos, sin; \ + copy_type v0, v1; \ + \ + VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, cos, v0, 16); \ + VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, sin, v1, 16); \ + coord_in.x *= 2; \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 coord_out = coord_in; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + float4 l00, l01, cos0, cos1; \ + float4 l10, l11, sin0, sin1; \ + VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \ + VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \ + VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \ + VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \ + float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \ + float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \ + \ + int4 dst0 = convert_int4_rte(data2); \ + int4 dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, \ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \ + VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \ + data2 = l10 * cos1 * scale0 - l11 * sin1 * 
scale1 + output_zp; \ + data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \ + \ + dst0 = convert_int4_rte(data2); \ + dst1 = convert_int4_rte(data3); \ + \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + coord_out.x += 8; \ + VXC_OP4_NoDest(img_store_3d, output, \ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +ROPE_ASYM_BNSH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8) +ROPE_ASYM_BNSH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8) +ROPE_ASYM_BNSH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8) +ROPE_ASYM_BNSH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8) +ROPE_ASYM_BNSH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8) +ROPE_ASYM_BNSH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8) + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; +__kernel void rope_BF16_BF16toBF16_bnsh + ( + __read_only image2d_array_t input, + __read_only image2d_array_t cos_cache, + __read_only image2d_array_t sin_cache, + __write_only image2d_array_t output, + int axis + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + + vxc_ushort8 v0, v1, v2, v3, dst; + VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + coord_in.x *= 2; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + int4 coord_out = coord_in; + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_short8 data; + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src0, data, 16); + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src1, data, 16); + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, cos0, data, 16); + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, cos1, data, 16); + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, sin0, data, 16); + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, sin1, data, 16); + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, src2, data, 16); + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, src3, data, 16); + + float4 even = (float4)(src0.xz, src1.xz); + float4 odd = (float4)(src0.yw, src1.yw); + float4 data0 = even * cos0 - odd * sin0; + float4 data1 = even * sin0 + odd * cos0; + + _viv_asm(COPY, v0, 
data0, 16); + _viv_asm(COPY, v1, data1, 16); + + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + + even = (float4)(src2.xz, src3.xz); + odd = (float4)(src2.yw, src3.yw); + data0 = even * cos1 - odd * sin1; + data1 = even * sin1 + odd * cos1; + + _viv_asm(COPY, v0, data0, 16); + _viv_asm(COPY, v1, data1, 16); + + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + coord_out.x += 8; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_special.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_special.vx index 1118356..7e2970a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_special.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_special.vx @@ -93,3 +93,101 @@ __kernel void scatter_nd_update_cpy2out_##src0_type##to##src0_type( \ } SCATTER_ND_UPDATE_COPY2OUT(U8, vxc_uchar16, 1) SCATTER_ND_UPDATE_COPY2OUT(I8, vxc_char16, 1) +SCATTER_ND_UPDATE_COPY2OUT(U16, vxc_ushort8, 2) +SCATTER_ND_UPDATE_COPY2OUT(I16, vxc_short8, 2) + +#define SCATTER_ND_UPDATE_REF2OUT_16BITS(src0_type, data_type) \ +__kernel void scatter_nd_update_ref2out_##src0_type##to##src0_type( \ + __read_only image2d_t input_ref, \ + image2d_t temp_ref, \ + image2d_t output0 \ + ) \ +{ \ + int gidx = get_global_id(0); \ + Image img0 = create_image_from_image2d(input_ref, 2); \ + Image img1 = create_image_from_image2d(temp_ref, 2); \ + __global data_type* in_ptr = (__global data_type*)img0.ptr; \ + __global data_type* out_ptr = (__global data_type*)img1.ptr; \ + data_type src, dst; \ + src = in_ptr[gidx]; \ + vxc_ushort8 mp0; \ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \ + VXC_DP2x8(dst, src, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift0_Lo_2x8); \ + out_ptr[gidx] = dst; \ +} +SCATTER_ND_UPDATE_REF2OUT_16BITS(U16, vxc_ushort8) +SCATTER_ND_UPDATE_REF2OUT_16BITS(I16, vxc_short8) + +#define SCATTER_ND_UPDATE_UPDATE2REF_16BITS(src0_type, data_type) \ +__kernel void scatter_nd_update_update2ref_##src0_type##to##src0_type##_16x( \ + __read_only image2d_t input_index, \ + __read_only image2d_t input_update, \ + image2d_t temp_ref, \ + image2d_t input0, \ + image2d_t output1, \ + int width, int area, int vol, int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + Image img1 = create_image_from_image2d(input_index, 4); \ + Image img2 = create_image_from_image2d(input_update, 2); \ + Image img3 = create_image_from_image2d(temp_ref, 2); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global data_type* update_ptr = (__global data_type*)img2.ptr; \ + __global data_type* output_ptr = (__global data_type*)img3.ptr; \ + data_type dst; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx); \ + data_type src = update_ptr[gidy * update_width + gidx]; \ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \ + int loc = idx * output_width + gidx; \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + VXC_DP2x8(dst, src, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift1_Lo_2x8); \ + output_ptr[loc] = dst; \ +} +SCATTER_ND_UPDATE_UPDATE2REF_16BITS(U16, vxc_ushort8) 
+SCATTER_ND_UPDATE_UPDATE2REF_16BITS(I16, vxc_short8) + +__kernel void scatter_nd_update_ref2out_F16toF16( + __read_only image2d_t input_ref, + image2d_t temp_ref, + image2d_t output0 + ) +{ + int gidx = get_global_id(0); + Image img0 = create_image_from_image2d(input_ref, 2); + Image img1 = create_image_from_image2d(temp_ref, 2); + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)img0.ptr; + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)img1.ptr; + out_ptr[gidx] = in_ptr[gidx]; +} + +__kernel void scatter_nd_update_update2ref_F16toF16_16x( + __read_only image2d_t input_index, + __read_only image2d_t input_update, + image2d_t temp_ref, + image2d_t input0, + image2d_t output1, + int width, int area, int vol, int coord_dim + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + + Image img1 = create_image_from_image2d(input_index, 4); + Image img2 = create_image_from_image2d(input_update, 2); + Image img3 = create_image_from_image2d(temp_ref, 2); + __global int* index_ptr = (__global int*)img1.ptr; + __global vxc_ushort8* update_ptr = (__global vxc_ushort8*)img2.ptr; + __global vxc_ushort8* output_ptr = (__global vxc_ushort8*)img3.ptr; + + int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx); + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; + int loc = idx * output_width + gidx; + output_ptr[loc] = update_ptr[gidy * update_width + gidx]; +} diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 5d4159a..9503736 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -9841,6 +9841,315 @@ CUMSUM_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8)\n\ CUMSUM_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16)\n\ "; /* end of cumsum_f16_u8_vx*/ +static const char custom_letterbox_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int top;\n\ +_viv_uniform int left;\n\ +_viv_uniform float out_scale_r;\n\ +_viv_uniform float out_scale_g;\n\ +_viv_uniform float out_scale_b;\n\ +_viv_uniform float out_zp_r;\n\ +_viv_uniform float out_zp_g;\n\ +_viv_uniform float out_zp_b;\n\ +_viv_uniform float pad_v_r;\n\ +_viv_uniform float pad_v_g;\n\ +_viv_uniform float pad_v_b;\n\ +_viv_uniform float scale_w;\n\ +_viv_uniform float scale_h;\n\ +_viv_uniform int resize_max_w;\n\ +_viv_uniform int resize_max_h;\n\ +_viv_uniform int out_height;\n\ +_viv_uniform int r_order;\n\ +_viv_uniform int b_order;\n\ +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;\n\ +_viv_uniform VXC_512Bits uniLeftToFloat32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtactHalf8_2x8;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +__kernel void custom_letterbox_U8toU8\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int top_,\n\ + int bottom_,\n\ + int left_,\n\ + int right_,\n\ + float mean_r_,\n\ + float mean_g_,\n\ + float mean_b_,\n\ + float scale_r_,\n\ + float scale_g_,\n\ + float scale_b_,\n\ + int pad_r_,\n\ + int pad_g_,\n\ + int pad_b_,\n\ + int reverse_channel\n\ + )\n\ +{\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord = coord_out;\n\ + uint4 dst = (uint4)(0,0,0,0);\n\ + vxc_uchar8 result;\n\ +\n\ + if (coord_out.x < left || coord_out.x >= resize_max_w ||\n\ + coord_out.y < top || coord_out.y >= resize_max_h)\n\ + {\n\ + dst.x = convert_uint(pad_v_r);\n\ + coord.y = coord_out.y + r_order;\n\ + 
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + dst.x = convert_uint(pad_v_g);\n\ + coord.y = coord_out.y + out_height;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + dst.x = convert_uint(pad_v_b);\n\ + coord.y = coord_out.y + b_order;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + return;\n\ + }\n\ +\n\ + float in_x = convert_float(coord_out.x - left) * scale_w;\n\ + float in_y = convert_float(coord_out.y - top) * scale_h;\n\ + float left_x_f = floor(in_x);\n\ + float top_y_f = floor(in_y);\n\ + float x_lerp = in_x - left_x_f;\n\ + float y_lerp = in_y - top_y_f;\n\ + int left_x_idx = convert_int(left_x_f);\n\ + int top_y_idx = convert_int(top_y_f);\n\ +\n\ + int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);\n\ + vxc_uchar8 top_data, bottom_data;\n\ +\n\ + VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4 = (float4)(0,0,0,0);\n\ + float4 right4 = (float4)(0,0,0,0);\n\ + float4 top4 = (float4)(0,0,0,0);\n\ + float4 bottom4 = (float4)(0,0,0,0);\n\ + VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + float4 out = (bottom4 - top4) * y_lerp + top4;\n\ +\n\ + dst.x = convert_uint(out.x * out_scale_r + out_zp_r );\n\ + coord.y = coord_out.y + r_order;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + dst.x = convert_uint(out.y * out_scale_g + out_zp_g);\n\ + coord.y = coord_out.y + out_height;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + dst.x = convert_uint(out.z * out_scale_b + out_zp_b);\n\ + coord.y = coord_out.y + b_order;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_letterbox_U8toI8\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int top_,\n\ + int bottom_,\n\ + int left_,\n\ + int right_,\n\ + float mean_r_,\n\ + float mean_g_,\n\ + float mean_b_,\n\ + float scale_r_,\n\ + float scale_g_,\n\ + float scale_b_,\n\ + int pad_r_,\n\ + int pad_g_,\n\ + int pad_b_,\n\ + int reverse_channel\n\ + )\n\ +{\n\ + int2 coord_out = (int2)(get_global_id(0), 
get_global_id(1));\n\ + int2 coord = coord_out;\n\ + int4 dst = (int4)(0,0,0,0);\n\ + vxc_char8 result;\n\ +\n\ + if (coord_out.x < left || coord_out.x >= resize_max_w ||\n\ + coord_out.y < top || coord_out.y >= resize_max_h)\n\ + {\n\ + dst.x = convert_int(pad_v_r);\n\ + coord.y = coord_out.y + r_order;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + dst.x = convert_int(pad_v_g);\n\ + coord.y = coord_out.y + out_height;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + dst.x = convert_int(pad_v_b);\n\ + coord.y = coord_out.y + b_order;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + return;\n\ + }\n\ +\n\ + float in_x = convert_float(coord_out.x - left) * scale_w;\n\ + float in_y = convert_float(coord_out.y - top) * scale_h;\n\ + float left_x_f = floor(in_x);\n\ + float top_y_f = floor(in_y);\n\ + float x_lerp = in_x - left_x_f;\n\ + float y_lerp = in_y - top_y_f;\n\ + int left_x_idx = convert_int(left_x_f);\n\ + int top_y_idx = convert_int(top_y_f);\n\ +\n\ + int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);\n\ + vxc_char8 top_data, bottom_data;\n\ +\n\ + VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4 = (float4)(0,0,0,0);\n\ + float4 right4 = (float4)(0,0,0,0);\n\ + float4 top4 = (float4)(0,0,0,0);\n\ + float4 bottom4 = (float4)(0,0,0,0);\n\ + VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + float4 out = (bottom4 - top4) * y_lerp + top4;\n\ +\n\ + dst.x = convert_int(out.x * out_scale_r + out_zp_r);\n\ + coord.y = coord_out.y + r_order;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + dst.x = convert_int(out.y * out_scale_g + out_zp_g);\n\ + coord.y = coord_out.y + out_height;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + dst.x = convert_int(out.z * out_scale_b + out_zp_b);\n\ + coord.y = coord_out.y + b_order;\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);\n\ + VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_letterbox_U8toF16\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int top_,\n\ + int 
bottom_,\n\ + int left_,\n\ + int right_,\n\ + float mean_r_,\n\ + float mean_g_,\n\ + float mean_b_,\n\ + float scale_r_,\n\ + float scale_g_,\n\ + float scale_b_,\n\ + int pad_r_,\n\ + int pad_g_,\n\ + int pad_b_,\n\ + int reverse_channel\n\ + )\n\ +{\n\ + int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord = coord_out;\n\ + half4 tmp;\n\ + vxc_half8 dst_temp;\n\ + vxc_ushort8 dst;\n\ +\n\ + if (coord_out.x < left || coord_out.x >= resize_max_w ||\n\ + coord_out.y < top || coord_out.y >= resize_max_h)\n\ + {\n\ +\n\ + float4 pad = (float4)(pad_v_r, pad_v_g, pad_v_b, 0);\n\ + _viv_asm(CONV, tmp, pad);\n\ + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst, dst_temp, 16);\n\ + coord.y = coord_out.y + r_order;\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + tmp.x = tmp.y;\n\ + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst, dst_temp, 16);\n\ + coord.y = coord_out.y + out_height;\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + tmp.x = tmp.z;\n\ + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst, dst_temp, 16);\n\ + coord.y = coord_out.y + b_order;\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + return;\n\ + }\n\ +\n\ + float in_x = convert_float(coord_out.x - left) * scale_w;\n\ + float in_y = convert_float(coord_out.y - top) * scale_h;\n\ + float left_x_f = floor(in_x);\n\ + float top_y_f = floor(in_y);\n\ + float x_lerp = in_x - left_x_f;\n\ + float y_lerp = in_y - top_y_f;\n\ + int left_x_idx = convert_int(left_x_f);\n\ + int top_y_idx = convert_int(top_y_f);\n\ +\n\ + int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);\n\ + vxc_uchar8 top_data, bottom_data;\n\ +\n\ + VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4 = (float4)(0,0,0,0);\n\ + float4 right4 = (float4)(0,0,0,0);\n\ + float4 top4 = (float4)(0,0,0,0);\n\ + float4 bottom4 = (float4)(0,0,0,0);\n\ + VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ + VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + float4 out = (bottom4 - top4) * y_lerp + top4;\n\ +\n\ + float4 out_temp = (float4)(0,0,0,0);\n\ + out_temp.x = out.x * out_scale_r + out_zp_r;\n\ + _viv_asm(CONV, tmp, out_temp);\n\ + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst, dst_temp, 16);\n\ + coord.y = coord_out.y + r_order;\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + out_temp.x = out.y * out_scale_g + out_zp_g;\n\ + _viv_asm(CONV, tmp, out_temp);\n\ + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), 
uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst, dst_temp, 16);\n\ + coord.y = coord_out.y + out_height;\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + out_temp.x = out.z * out_scale_b + out_zp_b;\n\ + _viv_asm(CONV, tmp, out_temp);\n\ + VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);\n\ + _viv_asm(COPY, dst, dst_temp, 16);\n\ + coord.y = coord_out.y + out_height;\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of custom_letterbox_vx*/ + static const char custom_softmax_vx[] = "/*\n\ ============================================================================\n\ Name : Softmax2.vx\n\ @@ -9853,7 +10162,12 @@ static const char custom_softmax_vx[] = "/*\n\ #include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32;\n\ +_viv_uniform VXC_512Bits uniExtract8Bin_2x8;\n\ _viv_uniform int sf_size;\n\ +_viv_uniform float srcScale;\n\ +_viv_uniform float srcZP;\n\ +_viv_uniform float dstScale;\n\ +_viv_uniform float dstZP;\n\ #define F_MAX(a,b) ((a)>(b)?(a):(b))\n\ __kernel void Softmax2VXC\n\ (\n\ @@ -9862,35 +10176,37 @@ __kernel void Softmax2VXC\n\ int axis\n\ )\n\ {\n\ - int4 coord_in = (int4)(0,0,0,0);\n\ - float fMax = 0.0;\n\ + int4 coord_in = (int4)(0, get_global_id(0), 0, 0);\n\ + float fMax = 0;\n\ for (int i = 0; i < sf_size; i++)\n\ {\n\ - vxc_char8 val;\n\ + vxc_short8 val;\n\ + vxc_half8 val_h;\n\ coord_in.x = i;\n\ - VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, val_h, val, 16);\n\ float fval;\n\ - VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ + VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ \n\ fMax = F_MAX(fMax, fval);\n\ }\n\ -\n\ float fProbSum = 0.0f;\n\ vxc_short8 dst;\n\ for (int i = 0; i < sf_size; i++)\n\ {\n\ - vxc_char8 val;\n\ -\n\ + vxc_short8 val;\n\ + vxc_half8 val_h;\n\ coord_in.x = i;\n\ - VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, val_h, val, 16);\n\ float fval;\n\ - VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ -\n\ + VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ float fOut = (float)exp(fval - fMax);\n\ fProbSum += fOut;\n\ half hVal;\n\ - _viv_asm(CONV,hVal,fOut);\n\ - _viv_asm(COPY,dst,hVal, 4);\n\ + _viv_asm(CONV, hVal, fOut);\n\ + _viv_asm(COPY, dst, hVal, 4);\n\ +\n\ VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ @@ -9899,19 +10215,71 @@ __kernel void Softmax2VXC\n\ vxc_short8 val;\n\ vxc_half8 val_h;\n\ coord_in.x = i;\n\ - VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ float fval;\n\ _viv_asm(COPY, val_h,val, 16);\n\ VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ -\n\ - float fOut =fval/fProbSum;\n\ + float fOut =fval / fProbSum;\n\ half hVal;\n\ - _viv_asm(CONV,hVal,fOut);\n\ + _viv_asm(CONV, hVal, fOut);\n\ 
_viv_asm(COPY,dst,hVal, 4);\n\ VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ -"; /* end of custom_softmax_vx*/ +\n\ +__kernel void Softmax2VXC_u8\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(0, get_global_id(0), 0, 0);\n\ + float fMax = -3.4e38f;\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_uchar8 val;\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ + fval = (fval - srcZP) * srcScale;\n\ + fMax = F_MAX(fMax, fval);\n\ + }\n\ +\n\ + float fProbSum = 0.0f;\n\ + vxc_uchar8 dst;\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_uchar8 val;\n\ +\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ + fval = (fval - srcZP) * srcScale;\n\ + float fOut = (float)exp(fval - fMax);\n\ + fProbSum += fOut;\n\ + }\n\ +\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_uchar8 val;\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ + fval = (fval - srcZP) * srcScale;\n\ +\n\ + float fOut = exp(fval - fMax) / fProbSum;\n\ +\n\ + fOut = fOut * dstScale + dstZP;\n\ + short dst0;\n\ + _viv_asm(CONV, dst0, fOut);\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8);\n\ + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of custom_softmax_vx*/ static const char custom_warp_affine_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ \n\ @@ -18077,7 +18445,7 @@ _viv_uniform float sum_x2_tail1;\n\ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ \n\ -#define GROUP_NORM_SUMS_16BITS_IMPL(name, src_type) \\\n\ +#define GROUP_NORM_SUMS_16BITS_IMPL(name, load_type, src_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ @@ -18087,7 +18455,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## int lidx = get_local_id(0); \\\n\ int gidz = get_global_id(1); \\\n\ int4 coord = (int4)(gidx, 0, gidz, 0); \\\n\ - vxc_short8 src0; \\\n\ + load_type src; \\\n\ src_type in_h; \\\n\ float4 sumsqr; \\\n\ float4 tmpSumSqr = (float4)(0); \\\n\ @@ -18104,9 +18472,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## { \\\n\ for(coord.y = 0; coord.y < height;) \\\n\ { \\\n\ - VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ coord.y++; \\\n\ - _viv_asm(COPY, in_h, src0, 16); \\\n\ + _viv_asm(COPY, in_h, src, 16); \\\n\ VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \\\n\ tmpSumSqr += sumsqr; \\\n\ } \\\n\ @@ -18137,10 +18505,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## write_imagef(output, 
coord_out, data); \\\n\ } \\\n\ }\n\ -GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_half8)\n\ -GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8)\n\ +GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_short8, vxc_half8)\n\ +GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8, vxc_short8)\n\ +GROUP_NORM_SUMS_16BITS_IMPL(U16, vxc_ushort8, vxc_ushort8)\n\ \n\ -#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \\\n\ +#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, load_type, src_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ @@ -18150,7 +18519,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## int lidx = get_local_id(0); \\\n\ \\\n\ int2 coord = (int2)(gidx, get_global_id(1)); \\\n\ - vxc_short8 src0; \\\n\ + load_type src; \\\n\ src_type in_h; \\\n\ float4 sumsqr = (float4)(0); \\\n\ \\\n\ @@ -18159,8 +18528,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## \\\n\ if(gidx < width) \\\n\ { \\\n\ - VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, in_h, src, 16); \\\n\ VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \\\n\ sumsqr.y = sumsqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sumsqr.x; \\\n\ sumsqr.x = sumsqr.x * input_scale + sum_x_tail; \\\n\ @@ -18189,8 +18558,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_## write_imagef(output, coord_out, data); \\\n\ } \\\n\ }\n\ -GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8)\n\ -GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8)\n\ +GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_short8, vxc_half8)\n\ +GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8, vxc_short8)\n\ +GROUP_NORM_SUMS_16BITS_IMPL_2D(U16, vxc_ushort8, vxc_ushort8)\n\ \n\ #define GROUP_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \\\n\ @@ -18239,7 +18609,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ norm = alpha * tmpData1 + bias_val; \\\n\ _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ @@ -18291,10 +18661,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ float4 norm; \\\n\ norm = alpha * tmpData0 + bias_val; \\\n\ - _viv_asm(CONV, tmpVal0, norm); \\\n\ + _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ norm = alpha * tmpData1 + bias_val; \\\n\ - _viv_asm(CONV, tmpVal1, norm); \\\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ 
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ @@ -18344,7 +18714,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \\\n\ float4 norm; \\\n\ norm = alpha * tmpData0 + bias_val; \\\n\ - _viv_asm(CONV, tmpVal0, norm); \\\n\ + _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ norm = alpha * tmpData1 + bias_val; \\\n\ _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ @@ -18357,6 +18727,7 @@ GROUP_NORM_16BITS_F32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int GROUP_NORM_16BITS_F32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ GROUP_NORM_16BITS_F32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ GROUP_NORM_16BITS_F32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +GROUP_NORM_16BITS_F32_IMPL(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4)\n\ \n\ #define GROUP_NORM_16BITS_F32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \\\n\ @@ -18394,10 +18765,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ float4 norm; \\\n\ norm = alpha * tmpData0 + bias_val; \\\n\ - _viv_asm(CONV, tmpVal0, norm); \\\n\ + _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ norm = alpha * tmpData1 + bias_val; \\\n\ - _viv_asm(CONV, tmpVal1, norm); \\\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ @@ -18407,6 +18778,7 @@ GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +GROUP_NORM_16BITS_F32_IMPL_2D(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4)\n\ \n\ "; /* end of group_normalization_2_vx*/ @@ -48227,45 +48599,45 @@ _viv_uniform VXC_512Bits uniDataSubZPtoFp32Part1_4x4;\n\ _viv_uniform VXC_512Bits uniConvF16toF32_part0_4x4;\n\ _viv_uniform VXC_512Bits uniConvF16toF32_part1_4x4;\n\ _viv_uniform VXC_512Bits uniExtact8Bin_2x8;\n\ -_viv_uniform int inputZP0;\n\ -_viv_uniform int inputZP1;\n\ -_viv_uniform float input_scale0;\n\ -_viv_uniform float input_scale1;\n\ -_viv_uniform float outputZP;\n\ -#define PRELU_F16_3D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \\\n\ - __kernel void prelu_##name0##to##name1( \\\n\ +_viv_uniform int input0_zp;\n\ +_viv_uniform int input1_zp;\n\ +_viv_uniform float input0_scale;\n\ +_viv_uniform float input1_scale;\n\ +_viv_uniform float output_zp;\n\ +#define PRELU_F16_3D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \\\n\ + __kernel void prelu_##name( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ __write_only image2d_array_t output) \\\n\ {\\\n\ int4 coord = (int4)(get_global_id(0), 
get_global_id(1), get_global_id(2), 0);\\\n\ - vxc_float4 vecA, vecB, vecC, vecD;\\\n\ + float4 vecA, vecB, vecC, vecD;\\\n\ input_type0 srcA;\\\n\ copy_type0 src0;\\\n\ vxc_short8 srcB;\\\n\ vxc_half8 src1;\\\n\ - input_type0 input_ZP;\\\n\ + input_type0 zp;\\\n\ VXC_ReadImage2DArray(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ _viv_asm(COPY, src0, srcA, 16); \\\n\ VXC_ReadImage2DArray(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ _viv_asm(COPY, src1, srcB, 16); \\\n\ \\\n\ - _viv_asm(COPY, input_ZP, inputZP0, 4);\\\n\ - VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \\\n\ + _viv_asm(COPY, zp, input0_zp, 4);\\\n\ + VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \\\n\ uniDataSubZPtoFp32Part0_4x4); \\\n\ - VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \\\n\ + VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \\\n\ uniDataSubZPtoFp32Part1_4x4);\\\n\ VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\\\n\ VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\\\n\ \\\n\ - vecA = vecA * input_scale0;\\\n\ - vecB = vecB * input_scale0;\\\n\ - vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \\\n\ - vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \\\n\ - vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \\\n\ - vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \\\n\ - vecA = maxData0 + vecC * minData0 + outputZP;\\\n\ - vecB = maxData1 + vecD * minData1 + outputZP;\\\n\ + vecA = vecA * input0_scale;\\\n\ + vecB = vecB * input0_scale;\\\n\ + float4 maxData0 = vecA > 0 ? vecA : 0.0; \\\n\ + float4 maxData1 = vecB > 0 ? vecB : 0.0; \\\n\ + float4 minData0 = vecA < 0 ? vecA : 0.0; \\\n\ + float4 minData1 = vecB < 0 ? 
vecB : 0.0; \\\n\ + vecA = maxData0 + vecC * minData0 + output_zp;\\\n\ + vecB = maxData1 + vecD * minData1 + output_zp;\\\n\ convert_type dst0, dst1;\\\n\ _viv_asm(CONV_RTE, dst0, vecA);\\\n\ _viv_asm(CONV_RTE, dst1, vecB);\\\n\ @@ -48276,49 +48648,49 @@ _viv_uniform float outputZP;\n\ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ }\n\ // name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type\n\ -PRELU_F16_3D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)\n\ -PRELU_F16_3D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)\n\ -PRELU_F16_3D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)\n\ -PRELU_F16_3D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)\n\ -PRELU_F16_3D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\ -PRELU_F16_3D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)\n\ -PRELU_F16_3D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)\n\ -PRELU_F16_3D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)\n\ -PRELU_F16_3D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)\n\ -PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)\n\ +PRELU_F16_3D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)\n\ +PRELU_F16_3D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_3D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)\n\ +PRELU_F16_3D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_3D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\ +PRELU_F16_3D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_3D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_3D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)\n\ +PRELU_F16_3D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)\n\ +PRELU_F16_3D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)\n\ \n\ -#define PRELU_F16_2D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \\\n\ - __kernel void prelu_##name0##to##name1##_2D( \\\n\ +#define PRELU_F16_2D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \\\n\ + __kernel void prelu_##name##_2D( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ __write_only image2d_array_t output) \\\n\ {\\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\\\n\ - vxc_float4 vecA, vecB, vecC, vecD;\\\n\ + float4 vecA, vecB, vecC, vecD;\\\n\ input_type0 srcA;\\\n\ copy_type0 src0;\\\n\ vxc_short8 srcB;\\\n\ vxc_half8 src1;\\\n\ - input_type0 input_ZP;\\\n\ + input_type0 zp;\\\n\ VXC_ReadImage(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ _viv_asm(COPY, src0, srcA, 16); \\\n\ VXC_ReadImage(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ _viv_asm(COPY, src1, srcB, 16); \\\n\ \\\n\ - _viv_asm(COPY, input_ZP, inputZP0, 4);\\\n\ - VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\\\n\ - VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\\\n\ + _viv_asm(COPY, zp, input0_zp, 4);\\\n\ + VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\\\n\ + VXC_DP4x4(vecB, 
src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\\\n\ VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\\\n\ VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\\\n\ \\\n\ - vecA = vecA * input_scale0;\\\n\ - vecB = vecB * input_scale0;\\\n\ - vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \\\n\ - vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \\\n\ - vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \\\n\ - vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \\\n\ - vecA = maxData0 + vecC * minData0 + outputZP;\\\n\ - vecB = maxData1 + vecD * minData1 + outputZP;\\\n\ + vecA = vecA * input0_scale;\\\n\ + vecB = vecB * input0_scale;\\\n\ + float4 maxData0 = vecA > 0 ? vecA : 0.0; \\\n\ + float4 maxData1 = vecB > 0 ? vecB : 0.0; \\\n\ + float4 minData0 = vecA < 0 ? vecA : 0.0; \\\n\ + float4 minData1 = vecB < 0 ? vecB : 0.0; \\\n\ + vecA = maxData0 + vecC * minData0 + output_zp;\\\n\ + vecB = maxData1 + vecD * minData1 + output_zp;\\\n\ convert_type dst0, dst1;\\\n\ _viv_asm(CONV_RTE, dst0, vecA);\\\n\ _viv_asm(CONV_RTE, dst1, vecB);\\\n\ @@ -48328,49 +48700,49 @@ PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha _viv_asm(COPY, dst, dst2, 16); \\\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ }\n\ -PRELU_F16_2D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)\n\ -PRELU_F16_2D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)\n\ -PRELU_F16_2D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)\n\ -PRELU_F16_2D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\ -PRELU_F16_2D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)\n\ -PRELU_F16_2D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)\n\ -PRELU_F16_2D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)\n\ -PRELU_F16_2D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)\n\ -PRELU_F16_2D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)\n\ -PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)\n\ +PRELU_F16_2D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_2D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)\n\ +PRELU_F16_2D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_2D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\ +PRELU_F16_2D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_2D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)\n\ +PRELU_F16_2D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)\n\ +PRELU_F16_2D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)\n\ +PRELU_F16_2D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)\n\ +PRELU_F16_2D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)\n\ \n\ -#define PRELU_U8_2D(name, output_type, convert_type, copy_type) \\\n\ - __kernel void prelu_U8U8to##name##_2D( \\\n\ +#define PRELU_INTEGER_2D(name, src0_type, src1_type, output_type, convert_type, copy_type) \\\n\ + __kernel void prelu_##name##_2D( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ __write_only image2d_array_t output) \\\n\ {\\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\\\n\ - vxc_float4 vecA, vecB, 
vecC, vecD;\\\n\ - vxc_uchar16 src0;\\\n\ - vxc_uchar16 src1;\\\n\ - vxc_uchar16 input_ZP0;\\\n\ - vxc_uchar16 input_ZP1;\\\n\ + float4 vecA, vecB, vecC, vecD;\\\n\ + src0_type src0;\\\n\ + src1_type src1;\\\n\ + short zp0;\\\n\ + short zp1;\\\n\ VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ \\\n\ - _viv_asm(COPY, input_ZP0, inputZP0, 4);\\\n\ - VXC_DP4x4(vecA, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\\\n\ - VXC_DP4x4(vecB, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\\\n\ - _viv_asm(COPY, input_ZP1, inputZP1, 4);\\\n\ - VXC_DP4x4(vecC, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\\\n\ - VXC_DP4x4(vecD, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\\\n\ + _viv_asm(COPY, zp0, input0_zp, 2);\\\n\ + VXC_DP4x4(vecA, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\\\n\ + VXC_DP4x4(vecB, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\\\n\ + _viv_asm(COPY, zp1, input1_zp, 4);\\\n\ + VXC_DP4x4(vecC, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\\\n\ + VXC_DP4x4(vecD, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\\\n\ \\\n\ - vecA = vecA * input_scale0;\\\n\ - vecB = vecB * input_scale0;\\\n\ - vecC = vecC * input_scale1;\\\n\ - vecD = vecD * input_scale1;\\\n\ - vxc_float4 maxData0 = vecA >= 0 ? vecA : 0.0; \\\n\ - vxc_float4 maxData1 = vecB >= 0 ? vecB : 0.0; \\\n\ - vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \\\n\ - vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \\\n\ - vecA = maxData0 + vecC * minData0 + outputZP;\\\n\ - vecB = maxData1 + vecD * minData1 + outputZP;\\\n\ + vecA = vecA * input0_scale;\\\n\ + vecB = vecB * input0_scale;\\\n\ + vecC = vecC * input1_scale;\\\n\ + vecD = vecD * input1_scale;\\\n\ + float4 maxData0 = vecA >= 0 ? vecA : 0.0; \\\n\ + float4 maxData1 = vecB >= 0 ? vecB : 0.0; \\\n\ + float4 minData0 = vecA < 0 ? vecA : 0.0; \\\n\ + float4 minData1 = vecB < 0 ? 
vecB : 0.0; \\\n\ + vecA = maxData0 + vecC * minData0 + output_zp;\\\n\ + vecB = maxData1 + vecD * minData1 + output_zp;\\\n\ convert_type dst0, dst1;\\\n\ _viv_asm(CONV_RTE, dst0, vecA);\\\n\ _viv_asm(CONV_RTE, dst1, vecB);\\\n\ @@ -48380,8 +48752,9 @@ PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha _viv_asm(COPY, dst, dst2, 16); \\\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\\\n\ }\n\ -PRELU_U8_2D(U8, vxc_uchar16, int4, vxc_uchar16)\n\ -PRELU_U8_2D(F16, vxc_half8, half4, vxc_short8)\n\ +PRELU_INTEGER_2D(U8U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)\n\ +PRELU_INTEGER_2D(U8U8toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)\n\ +\n\ \n\ \n\ "; /* end of prelu_vx*/ @@ -54918,6 +55291,462 @@ __kernel void resize_bilinear_U8toU8_SAME_8x_upsample_half_pixel_centers\n\ }\n\ "; /* end of resize_bilinear_U8_half_pixel_centers_2_vx*/ +static const char resize_bilinear_U8_half_pixel_centers_3_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8PostProcess_2x8;\n\ +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniResize2xUp_0_4x8;\n\ +_viv_uniform VXC_512Bits uniResize2xUp_1_4x8;\n\ +_viv_uniform int out_height;\n\ +\n\ +__kernel void resize_bilinear_U8toU8_2x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);\n\ + coord_in.x = (coord_out.x * 2 - 1) >> 2;\n\ + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;\n\ +\n\ + vxc_uchar16 in0, in1, tmp, result;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + vxc_ushort8 multiplier;\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ +\n\ + vxc_ushort8 dst0;\n\ + while (coord_out.y < out_height)\n\ + {\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + 
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ + VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y += 2;\n\ + coord_out.y++;\n\ + }\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniResize4xUp_l00_4x8;\n\ +_viv_uniform VXC_512Bits uniResize4xUp_l01_4x8;\n\ +_viv_uniform VXC_512Bits uniResize4xUp_l10_4x8;\n\ +_viv_uniform VXC_512Bits uniResize4xUp_l11_4x8;\n\ +__kernel void resize_bilinear_U8toU8_4x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);\n\ + coord_in.x = (coord_out.x * 2 - 3) >> 3;\n\ + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x;\n\ +\n\ + vxc_uchar16 in0, in1, dst0, dst1;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + vxc_ushort8 multiplier;\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ +\n\ + vxc_ushort8 tmp;\n\ + while (coord_out.y < out_height)\n\ + {\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), 
uniResize4xUp_l01_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y += 2;\n\ + coord_out.y++;\n\ + }\n\ +}\n\ +"; /* end of resize_bilinear_U8_half_pixel_centers_3_vx*/ + +static const char resize_bilinear_U8_half_pixel_centers_4_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8PostProcess_2x8;\n\ +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l00_2x8;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l01_2x8;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l10_4x4;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l11_4x4;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l12_4x4;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l13_4x4;\n\ +__kernel void resize_bilinear_U8toU8_3x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + coord_in.x = (short)(coord_out.x * 2 - 1) / (short)6;\n\ + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;\n\ + coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6;\n\ + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y;\n\ +\n\ + vxc_uchar16 in0, in1, in2, in3, tmp, dst0, dst1, dst2;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + vxc_ushort8 multiplier;\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ +\n\ + vxc_ushort8 data;\n\ +\n\ + VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);\n\ + VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ + VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ + VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ + VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ +\n\ + VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);\n\ + VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);\n\ + VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);\n\ + VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ + VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ + VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ + VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);\n\ + VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ + VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ + VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ + VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, 
VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ +\n\ + VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);\n\ + VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);\n\ + VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);\n\ + VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ + VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ + VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ + VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of resize_bilinear_U8_half_pixel_centers_4_vx*/ + +static const char resize_bilinear_U8_half_pixel_centers_5_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8PostProcess_2x8;\n\ +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform int out_height;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l00_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l01_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l10_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l11_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l20_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l21_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l30_4x8;\n\ +_viv_uniform VXC_512Bits uniResize8xUp_l31_4x8;\n\ +__kernel void resize_bilinear_U8toU8_8x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);\n\ + coord_in.x = (coord_out.x * 2 - 7) >> 4;\n\ + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x;\n\ +\n\ + vxc_uchar16 in0, in1, in2, dst0, dst1, dst2, dst3;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + vxc_ushort8 multiplier;\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16);\n\ +\n\ + vxc_ushort8 tmp;\n\ + while (coord_out.y < out_height)\n\ + {\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);\n\ + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);\n\ + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);\n\ + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);\n\ + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), 
uniResize8xUp_l31_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);\n\ + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);\n\ + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);\n\ + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);\n\ + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);\n\ + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);\n\ + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);\n\ + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);\n\ + VXC_DP2x8(dst3, tmp, 
multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y += 2;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);\n\ + VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);\n\ + VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);\n\ + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);\n\ + VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);\n\ + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);\n\ + VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + }\n\ +}\n\ +"; /* end of resize_bilinear_U8_half_pixel_centers_5_vx*/ + static const char resize_bilinear_U8_opt_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ #if (VX_VERSION==2)\n\ @@ -56088,6 +56917,1186 @@ __kernel void resize_nearest_I16toI16_op\n\ }\n\ "; /* end of resize_nearest_vx*/ +static const char rope_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float scale0;\n\ +_viv_uniform float scale1;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int half_head_size;\n\ +_viv_uniform VXC_512Bits 
uniATimesB_0_4x4;\n\ +_viv_uniform VXC_512Bits uniATimesB_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +#define ROPE_BNHS_SYMM(name, src_type, src1_type, copy_type, dst_type) \\\n\ +__kernel void rope_##name##_bnhs \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t cos_cache, \\\n\ + __read_only image2d_array_t sin_cache, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_out = coord_in; \\\n\ + \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + \\\n\ + src_type data0, data1; \\\n\ + src1_type cos, sin; \\\n\ + copy_type v0, v1; \\\n\ + dst_type dst; \\\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, cos, v0, 16); \\\n\ + VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, sin, v1, 16); \\\n\ + coord_in.y += half_head_size; \\\n\ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + float4 data2, data3, data4, data5; \\\n\ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \\\n\ + VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \\\n\ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \\\n\ + VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \\\n\ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \\\n\ + data3 = data3 * scale0 - data5 * scale1 + output_zp; \\\n\ + \\\n\ + int4 dst0 = convert_int4_rte(data2); \\\n\ + int4 dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \\\n\ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \\\n\ + VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \\\n\ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \\\n\ + data2 = data2 * scale1 + data4 * scale0 + output_zp; \\\n\ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \\\n\ + \\\n\ + dst0 = convert_int4_rte(data2); \\\n\ + dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + coord_out.y += half_head_size; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +ROPE_BNHS_SYMM(I16_I16toI16, vxc_short8, vxc_short8, 
vxc_short8, vxc_short8)\n\ +ROPE_BNHS_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)\n\ +ROPE_BNHS_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)\n\ +ROPE_BNHS_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +ROPE_BNHS_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)\n\ +ROPE_BNHS_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)\n\ +\n\ +__kernel void rope_F16_F16toF16_bnhs\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_out = coord_in;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + vxc_short8 v0, v1, v2, v3, dst;\n\ + vxc_half8 data0, data1, cos, sin, dst2;\n\ + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, v0, 16);\n\ + VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, cos, v1, 16);\n\ + VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, sin, v2, 16);\n\ + coord_in.y += half_head_size;\n\ + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, v3, 16);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 data2, data3, data4, data5;\n\ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);\n\ + VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);\n\ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);\n\ + VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);\n\ + data2 = data2 - data4;\n\ + data3 = data3 - data5;\n\ +\n\ + half4 dst0;\n\ + half4 dst1;\n\ + _viv_asm(CONV_RTE, dst0, data2);\n\ + _viv_asm(CONV_RTE, dst1, data3);\n\ +\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);\n\ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);\n\ + VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);\n\ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);\n\ + data2 = data2 * scale1 + data4 * scale0 + output_zp;\n\ + data3 = data3 * scale1 + data5 * scale0 + output_zp;\n\ +\n\ + _viv_asm(CONV_RTE, dst0, data2);\n\ + _viv_asm(CONV_RTE, dst1, data3);\n\ +\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ + coord_out.y += half_head_size;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, 
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform int in0_zp;\n\ +_viv_uniform int cos_zp;\n\ +_viv_uniform int sin_zp;\n\ +_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;\n\ +_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;\n\ +#define ROPE_ASYM_BNHS(name, src1_type, copy_type, dtype) \\\n\ +__kernel void rope_##name##_bnhs \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t cos_cache, \\\n\ + __read_only image2d_array_t sin_cache, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_out = coord_in; \\\n\ + \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + \\\n\ + dtype data0, data1, dst; \\\n\ + src1_type cos, sin; \\\n\ + copy_type v0, v1; \\\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, cos, v0, 16); \\\n\ + VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, sin, v1, 16); \\\n\ + coord_in.y += half_head_size; \\\n\ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + float4 l00, l01, cos0, cos1; \\\n\ + float4 l10, l11, sin0, sin1; \\\n\ + VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \\\n\ + float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \\\n\ + \\\n\ + int4 dst0 = convert_int4_rte(data2); \\\n\ + int4 dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, \\\n\ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \\\n\ + data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \\\n\ + \\\n\ + dst0 = convert_int4_rte(data2); \\\n\ + dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + coord_out.y += half_head_size; \\\n\ + 
VXC_OP4_NoDest(img_store_3d, output, \\\n\ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +ROPE_ASYM_BNHS(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)\n\ +ROPE_ASYM_BNHS(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +ROPE_ASYM_BNHS(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)\n\ +ROPE_ASYM_BNHS(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)\n\ +ROPE_ASYM_BNHS(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)\n\ +ROPE_ASYM_BNHS(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +__kernel void rope_BF16_BF16toBF16_bnhs\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_out = coord_in;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + vxc_ushort8 v0, v1, v2, v3, dst;\n\ + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y += half_head_size;\n\ + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_short8 data;\n\ + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src0, data, 16);\n\ + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src1, data, 16);\n\ + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, cos0, data, 16);\n\ + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, cos1, data, 16);\n\ + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, sin0, data, 16);\n\ + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, sin1, data, 16);\n\ + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src2, data, 16);\n\ + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src3, data, 16);\n\ +\n\ + float4 data0 = src0 * cos0 - src2 * sin0;\n\ + float4 data1 = src1 * cos1 - src3 * sin1;\n\ +\n\ + _viv_asm(COPY, v0, data0, 16);\n\ + _viv_asm(COPY, v1, data1, 16);\n\ +\n\ + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, 
output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + data0 = src0 * sin0 + src2 * cos0;\n\ + data1 = src1 * sin1 + src3 * cos1;\n\ +\n\ + _viv_asm(COPY, v0, data0, 16);\n\ + _viv_asm(COPY, v1, data1, 16);\n\ +\n\ + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + coord_out.y += half_head_size;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of rope_0_vx*/ + +static const char rope_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float scale0;\n\ +_viv_uniform float scale1;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int half_head_size;\n\ +_viv_uniform VXC_512Bits uniATimesB_0_4x4;\n\ +_viv_uniform VXC_512Bits uniATimesB_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +#define ROPE_BNH1_SYMM(name, src_type, src1_type, copy_type, dst_type) \\\n\ +__kernel void rope_##name##_bnh1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t cos_cache, \\\n\ + __read_only image2d_array_t sin_cache, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + src_type data0, data1; \\\n\ + src1_type cos, sin; \\\n\ + copy_type v0, v1; \\\n\ + VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, cos, v0, 16); \\\n\ + VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, sin, v1, 16); \\\n\ + coord.x += half_head_size; \\\n\ + VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 data2, data3, data4, data5; \\\n\ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \\\n\ + VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \\\n\ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \\\n\ + VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \\\n\ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \\\n\ + data3 = data3 * scale0 - data5 * scale1 + output_zp; \\\n\ + \\\n\ + int4 dst0 = convert_int4_rte(data2); \\\n\ + int4 dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + dst_type dst; \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \\\n\ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \\\n\ + VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \\\n\ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \\\n\ + data2 = data2 * scale1 + data4 * scale0 + output_zp; \\\n\ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \\\n\ + \\\n\ + dst0 = convert_int4_rte(data2); \\\n\ + dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + 
VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +ROPE_BNH1_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ +ROPE_BNH1_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)\n\ +ROPE_BNH1_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)\n\ +ROPE_BNH1_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +ROPE_BNH1_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)\n\ +ROPE_BNH1_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)\n\ +\n\ +__kernel void rope_F16_F16toF16_bnh1\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 v0, v1, v2, v3, dst;\n\ + vxc_half8 data0, data1, cos, sin, dst2;\n\ + VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, v0, 16);\n\ + VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, cos, v1, 16);\n\ + VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, sin, v2, 16);\n\ + coord.x += half_head_size;\n\ + VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, v3, 16);\n\ +\n\ + float4 data2, data3, data4, data5;\n\ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);\n\ + VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);\n\ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);\n\ + VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);\n\ + data2 = data2 - data4;\n\ + data3 = data3 - data5;\n\ +\n\ + half4 dst0;\n\ + half4 dst1;\n\ + _viv_asm(CONV_RTE, dst0, data2);\n\ + _viv_asm(CONV_RTE, dst1, data3);\n\ +\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);\n\ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);\n\ + VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);\n\ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);\n\ + data2 = data2 + data4;\n\ + data3 = data3 + data5;\n\ +\n\ + _viv_asm(CONV_RTE, dst0, data2);\n\ + _viv_asm(CONV_RTE, dst1, data3);\n\ +\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ + VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform int in0_zp;\n\ +_viv_uniform int cos_zp;\n\ +_viv_uniform int sin_zp;\n\ +_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;\n\ +_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;\n\ +#define ROPE_ASYM_BNH1(name, src1_type, copy_type, dtype) \\\n\ +__kernel void rope_##name##_bnh1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t cos_cache, \\\n\ + 
__read_only image2d_array_t sin_cache, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + dtype data0, data1, dst; \\\n\ + src1_type cos, sin; \\\n\ + copy_type v0, v1; \\\n\ + VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, cos, v0, 16); \\\n\ + VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, sin, v1, 16); \\\n\ + coord.x += half_head_size; \\\n\ + VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 l00, l01, cos0, cos1; \\\n\ + float4 l10, l11, sin0, sin1; \\\n\ + VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \\\n\ + float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \\\n\ + \\\n\ + int4 dst0 = convert_int4_rte(data2); \\\n\ + int4 dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \\\n\ + data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \\\n\ + \\\n\ + dst0 = convert_int4_rte(data2); \\\n\ + dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +ROPE_ASYM_BNH1(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)\n\ +ROPE_ASYM_BNH1(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +ROPE_ASYM_BNH1(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)\n\ +ROPE_ASYM_BNH1(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)\n\ +ROPE_ASYM_BNH1(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)\n\ +ROPE_ASYM_BNH1(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +__kernel void rope_BF16_BF16toBF16_bnh1\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_ushort8 
v0, v1, v2, v3, dst;\n\ + VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += half_head_size;\n\ + VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_short8 data;\n\ + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src0, data, 16);\n\ + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src1, data, 16);\n\ + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, cos0, data, 16);\n\ + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, cos1, data, 16);\n\ + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, sin0, data, 16);\n\ + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, sin1, data, 16);\n\ + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src2, data, 16);\n\ + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src3, data, 16);\n\ +\n\ + float4 data0 = src0 * cos0 - src2 * sin0;\n\ + float4 data1 = src1 * cos1 - src3 * sin1;\n\ +\n\ + _viv_asm(COPY, v0, data0, 16);\n\ + _viv_asm(COPY, v1, data1, 16);\n\ +\n\ + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + data0 = src0 * sin0 + src2 * cos0;\n\ + data1 = src1 * sin1 + src3 * cos1;\n\ +\n\ + _viv_asm(COPY, v0, data0, 16);\n\ + _viv_asm(COPY, v1, data1, 16);\n\ +\n\ + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of rope_1_vx*/ + +static const char rope_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float scale0;\n\ +_viv_uniform float scale1;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4;\n\ +_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4;\n\ +_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4;\n\ +_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +#define ROPE_BSNH_SYMM(name, src_type, src1_type, copy_type, dst_type) \\\n\ +__kernel void rope_##name##_bsnh \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t cos_cache, \\\n\ + __read_only image2d_array_t sin_cache, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + \\\n\ + src_type data0, data1; \\\n\ + src1_type cos, sin; \\\n\ + copy_type v0, v1; \\\n\ + dst_type dst; \\\n\ + VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + 
_viv_asm(COPY, cos, v0, 16); \\\n\ + VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, sin, v1, 16); \\\n\ + \\\n\ + coord_in.x *= 2; \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 coord_out = coord_in; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + float4 data2, data3, data4, data5; \\\n\ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \\\n\ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \\\n\ + VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \\\n\ + VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \\\n\ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \\\n\ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \\\n\ + \\\n\ + int4 dst0 = convert_int4_rte(data2); \\\n\ + int4 dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \\\n\ + VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \\\n\ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \\\n\ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \\\n\ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \\\n\ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \\\n\ + \\\n\ + dst0 = convert_int4_rte(data2); \\\n\ + dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + coord_out.x += 8; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +ROPE_BSNH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ +ROPE_BSNH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)\n\ +ROPE_BSNH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)\n\ +ROPE_BSNH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +ROPE_BSNH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)\n\ +ROPE_BSNH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)\n\ +\n\ +__kernel void rope_F16_F16toF16_bsnh\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 
get_global_id(2));\n\ +\n\ + vxc_short8 v0, v1, v2, v3, dst;\n\ + vxc_half8 data0, data1, cos, sin, dst2;\n\ + VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, cos, v1, 16);\n\ + VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, sin, v2, 16);\n\ +\n\ + coord_in.x *= 2;\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, v0, 16);\n\ + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, v3, 16);\n\ +\n\ + int4 coord_out = coord_in;\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 data2, data3, data4, data5;\n\ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);\n\ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);\n\ + VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);\n\ + VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);\n\ + data2 = data2 - data4;\n\ + data3 = data3 + data5;\n\ +\n\ + half4 dst0;\n\ + half4 dst1;\n\ + _viv_asm(CONV_RTE, dst0, data2);\n\ + _viv_asm(CONV_RTE, dst1, data3);\n\ +\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);\n\ + VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);\n\ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);\n\ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);\n\ + data2 = data2 - data4;\n\ + data3 = data3 + data5;\n\ +\n\ + _viv_asm(CONV_RTE, dst0, data2);\n\ + _viv_asm(CONV_RTE, dst1, data3);\n\ +\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ + coord_out.x += 8;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform int in0_zp;\n\ +_viv_uniform int cos_zp;\n\ +_viv_uniform int sin_zp;\n\ +_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;\n\ +_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;\n\ +_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4;\n\ +_viv_uniform VXC_512Bits uniAOddMinusZp_4x4;\n\ +#define ROPE_ASYM_BSNH(name, src1_type, copy_type, dtype) \\\n\ +__kernel void rope_##name##_bsnh \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t cos_cache, \\\n\ + __read_only image2d_array_t sin_cache, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ 
+ \\\n\ + dtype data0, data1, dst; \\\n\ + src1_type cos, sin; \\\n\ + copy_type v0, v1; \\\n\ + \\\n\ + VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, cos, v0, 16); \\\n\ + VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, sin, v1, 16); \\\n\ + coord_in.x *= 2; \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 coord_out = coord_in; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + float4 l00, l01, cos0, cos1; \\\n\ + float4 l10, l11, sin0, sin1; \\\n\ + VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \\\n\ + VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \\\n\ + VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \\\n\ + float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \\\n\ + \\\n\ + int4 dst0 = convert_int4_rte(data2); \\\n\ + int4 dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, \\\n\ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \\\n\ + VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \\\n\ + data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \\\n\ + data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \\\n\ + \\\n\ + dst0 = convert_int4_rte(data2); \\\n\ + dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + coord_out.x += 8; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, \\\n\ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +ROPE_ASYM_BSNH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)\n\ +ROPE_ASYM_BSNH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +ROPE_ASYM_BSNH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)\n\ +ROPE_ASYM_BSNH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)\n\ +ROPE_ASYM_BSNH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)\n\ +ROPE_ASYM_BSNH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits 
uniExtractOddData_2x8;\n\ +__kernel void rope_BF16_BF16toBF16_bsnh\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ +\n\ + vxc_ushort8 v0, v1, v2, v3, dst;\n\ + VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x *= 2;\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int4 coord_out = coord_in;\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_short8 data;\n\ + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src0, data, 16);\n\ + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src1, data, 16);\n\ + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, cos0, data, 16);\n\ + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, cos1, data, 16);\n\ + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, sin0, data, 16);\n\ + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, sin1, data, 16);\n\ + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src2, data, 16);\n\ + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src3, data, 16);\n\ +\n\ + float4 even = (float4)(src0.xz, src1.xz);\n\ + float4 odd = (float4)(src0.yw, src1.yw);\n\ + float4 data0 = even * cos0 - odd * sin0;\n\ + float4 data1 = even * sin0 + odd * cos0;\n\ +\n\ + _viv_asm(COPY, v0, data0, 16);\n\ + _viv_asm(COPY, v1, data1, 16);\n\ +\n\ + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + even = (float4)(src2.xz, src3.xz);\n\ + odd = (float4)(src2.yw, src3.yw);\n\ + data0 = even * cos1 - odd * sin1;\n\ + data1 = even * sin1 + odd * cos1;\n\ +\n\ + _viv_asm(COPY, v0, data0, 16);\n\ + _viv_asm(COPY, v1, data1, 16);\n\ +\n\ + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + coord_out.x += 8;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of 
rope_2_vx*/ + +static const char rope_3_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float scale0;\n\ +_viv_uniform float scale1;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4;\n\ +_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4;\n\ +_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4;\n\ +_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +#define ROPE_BNSH_SYMM(name, src_type, src1_type, copy_type, dst_type) \\\n\ +__kernel void rope_##name##_bnsh \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t cos_cache, \\\n\ + __read_only image2d_array_t sin_cache, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + \\\n\ + src_type data0, data1; \\\n\ + src1_type cos, sin; \\\n\ + copy_type v0, v1; \\\n\ + dst_type dst; \\\n\ + VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, cos, v0, 16); \\\n\ + VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, sin, v1, 16); \\\n\ + \\\n\ + coord_in.x *= 2; \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 coord_out = coord_in; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + float4 data2, data3, data4, data5; \\\n\ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \\\n\ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \\\n\ + VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \\\n\ + VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \\\n\ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \\\n\ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \\\n\ + \\\n\ + int4 dst0 = convert_int4_rte(data2); \\\n\ + int4 dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \\\n\ + VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \\\n\ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \\\n\ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \\\n\ + data2 = data2 * scale0 - data4 * scale1 + output_zp; \\\n\ + data3 = data3 * scale1 + data5 * scale0 + output_zp; \\\n\ + \\\n\ + dst0 = convert_int4_rte(data2); \\\n\ + dst1 = 
convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + coord_out.x += 8; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +ROPE_BNSH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ +ROPE_BNSH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)\n\ +ROPE_BNSH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)\n\ +ROPE_BNSH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +ROPE_BNSH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)\n\ +ROPE_BNSH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)\n\ +\n\ +__kernel void rope_F16_F16toF16_bnsh\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ +\n\ + vxc_short8 v0, v1, v2, v3, dst;\n\ + vxc_half8 data0, data1, cos, sin, dst2;\n\ + VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, cos, v1, 16);\n\ + VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, sin, v2, 16);\n\ +\n\ + coord_in.x *= 2;\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data0, v0, 16);\n\ + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data1, v3, 16);\n\ +\n\ + int4 coord_out = coord_in;\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 data2, data3, data4, data5;\n\ + VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);\n\ + VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);\n\ + VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);\n\ + VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);\n\ + data2 = data2 - data4;\n\ + data3 = data3 + data5;\n\ +\n\ + half4 dst0;\n\ + half4 dst1;\n\ + _viv_asm(CONV_RTE, dst0, data2);\n\ + _viv_asm(CONV_RTE, dst1, data3);\n\ +\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);\n\ + VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);\n\ + VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);\n\ + VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);\n\ + data2 = data2 - data4;\n\ + data3 = 
data3 + data5;\n\ +\n\ + _viv_asm(CONV_RTE, dst0, data2);\n\ + _viv_asm(CONV_RTE, dst1, data3);\n\ +\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + _viv_asm(COPY, dst, dst2, 16);\n\ + coord_out.x += 8;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform int in0_zp;\n\ +_viv_uniform int cos_zp;\n\ +_viv_uniform int sin_zp;\n\ +_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;\n\ +_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;\n\ +_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4;\n\ +_viv_uniform VXC_512Bits uniAOddMinusZp_4x4;\n\ +#define ROPE_ASYM_BNSH(name, src1_type, copy_type, dtype) \\\n\ +__kernel void rope_##name##_bnsh \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t cos_cache, \\\n\ + __read_only image2d_array_t sin_cache, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + \\\n\ + dtype data0, data1, dst; \\\n\ + src1_type cos, sin; \\\n\ + copy_type v0, v1; \\\n\ + \\\n\ + VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, cos, v0, 16); \\\n\ + VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, sin, v1, 16); \\\n\ + coord_in.x *= 2; \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 coord_out = coord_in; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + float4 l00, l01, cos0, cos1; \\\n\ + float4 l10, l11, sin0, sin1; \\\n\ + VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \\\n\ + VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \\\n\ + VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \\\n\ + VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \\\n\ + float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \\\n\ + float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \\\n\ + \\\n\ + int4 dst0 = convert_int4_rte(data2); \\\n\ + int4 dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, \\\n\ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \\\n\ + VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \\\n\ + data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \\\n\ + data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \\\n\ + \\\n\ + dst0 = convert_int4_rte(data2); \\\n\ + dst1 = convert_int4_rte(data3); \\\n\ + \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + coord_out.x += 8; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, \\\n\ + coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +ROPE_ASYM_BNSH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)\n\ +ROPE_ASYM_BNSH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +ROPE_ASYM_BNSH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)\n\ +ROPE_ASYM_BNSH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)\n\ +ROPE_ASYM_BNSH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)\n\ +ROPE_ASYM_BNSH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +__kernel void rope_BF16_BF16toBF16_bnsh\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ +\n\ + vxc_ushort8 v0, v1, v2, v3, dst;\n\ + VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x *= 2;\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int4 coord_out = coord_in;\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_short8 data;\n\ + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src0, data, 16);\n\ + VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src1, data, 16);\n\ + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, cos0, data, 16);\n\ + VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, cos1, data, 16);\n\ + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, sin0, data, 16);\n\ + VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, sin1, data, 16);\n\ + VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, src2, data, 16);\n\ + VXC_DP2x8(data, v3, zero, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, src3, data, 16);\n\ +\n\ + float4 even = (float4)(src0.xz, src1.xz);\n\ + float4 odd = (float4)(src0.yw, src1.yw);\n\ + float4 data0 = even * cos0 - odd * sin0;\n\ + float4 data1 = even * sin0 + odd * cos0;\n\ +\n\ + _viv_asm(COPY, v0, data0, 16);\n\ + _viv_asm(COPY, v1, data1, 16);\n\ +\n\ + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + even = (float4)(src2.xz, src3.xz);\n\ + odd = (float4)(src2.yw, src3.yw);\n\ + data0 = even * cos1 - odd * sin1;\n\ + data1 = even * sin1 + odd * cos1;\n\ +\n\ + _viv_asm(COPY, v0, data0, 16);\n\ + _viv_asm(COPY, v1, data1, 16);\n\ +\n\ + VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + coord_out.x += 8;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of rope_3_vx*/ + static const char scatter_nd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniAccumulateSum_2x8;\n\ @@ -57985,6 +59994,104 @@ __kernel void scatter_nd_update_cpy2out_##src0_type##to##src0_type( \\\n\ }\n\ SCATTER_ND_UPDATE_COPY2OUT(U8, vxc_uchar16, 1)\n\ SCATTER_ND_UPDATE_COPY2OUT(I8, vxc_char16, 1)\n\ +SCATTER_ND_UPDATE_COPY2OUT(U16, vxc_ushort8, 2)\n\ +SCATTER_ND_UPDATE_COPY2OUT(I16, vxc_short8, 2)\n\ +\n\ +#define SCATTER_ND_UPDATE_REF2OUT_16BITS(src0_type, data_type) \\\n\ +__kernel void scatter_nd_update_ref2out_##src0_type##to##src0_type( \\\n\ + __read_only image2d_t input_ref, \\\n\ + image2d_t temp_ref, \\\n\ + image2d_t output0 \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + Image img0 = create_image_from_image2d(input_ref, 2); \\\n\ + Image img1 = create_image_from_image2d(temp_ref, 2); \\\n\ + __global data_type* in_ptr = (__global data_type*)img0.ptr; \\\n\ + __global data_type* out_ptr = (__global data_type*)img1.ptr; \\\n\ + data_type src, dst; \\\n\ + src = in_ptr[gidx]; \\\n\ + vxc_ushort8 mp0; \\\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(dst, src, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift0_Lo_2x8); \\\n\ + out_ptr[gidx] = dst; \\\n\ +}\n\ +SCATTER_ND_UPDATE_REF2OUT_16BITS(U16, vxc_ushort8)\n\ +SCATTER_ND_UPDATE_REF2OUT_16BITS(I16, vxc_short8)\n\ +\n\ +#define SCATTER_ND_UPDATE_UPDATE2REF_16BITS(src0_type, data_type) \\\n\ +__kernel void scatter_nd_update_update2ref_##src0_type##to##src0_type##_16x( \\\n\ + __read_only image2d_t input_index, \\\n\ + __read_only image2d_t input_update, \\\n\ + image2d_t temp_ref, \\\n\ + image2d_t input0, \\\n\ + image2d_t output1, \\\n\ + int width, int area, int vol, int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(input_index, 4); \\\n\ + Image img2 = create_image_from_image2d(input_update, 2); \\\n\ + Image img3 = create_image_from_image2d(temp_ref, 2); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global data_type* update_ptr = (__global data_type*)img2.ptr; \\\n\ + __global data_type* output_ptr = (__global data_type*)img3.ptr; \\\n\ + data_type dst; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx); \\\n\ + data_type src = update_ptr[gidy * update_width + gidx]; \\\n\ + int idx = indice.x * 
offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \\\n\ + int loc = idx * output_width + gidx; \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + VXC_DP2x8(dst, src, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift1_Lo_2x8); \\\n\ + output_ptr[loc] = dst; \\\n\ +}\n\ +SCATTER_ND_UPDATE_UPDATE2REF_16BITS(U16, vxc_ushort8)\n\ +SCATTER_ND_UPDATE_UPDATE2REF_16BITS(I16, vxc_short8)\n\ +\n\ +__kernel void scatter_nd_update_ref2out_F16toF16(\n\ + __read_only image2d_t input_ref,\n\ + image2d_t temp_ref,\n\ + image2d_t output0\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + Image img0 = create_image_from_image2d(input_ref, 2);\n\ + Image img1 = create_image_from_image2d(temp_ref, 2);\n\ + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)img0.ptr;\n\ + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)img1.ptr;\n\ + out_ptr[gidx] = in_ptr[gidx];\n\ +}\n\ +\n\ +__kernel void scatter_nd_update_update2ref_F16toF16_16x(\n\ + __read_only image2d_t input_index,\n\ + __read_only image2d_t input_update,\n\ + image2d_t temp_ref,\n\ + image2d_t input0,\n\ + image2d_t output1,\n\ + int width, int area, int vol, int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ +\n\ + Image img1 = create_image_from_image2d(input_index, 4);\n\ + Image img2 = create_image_from_image2d(input_update, 2);\n\ + Image img3 = create_image_from_image2d(temp_ref, 2);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global vxc_ushort8* update_ptr = (__global vxc_ushort8*)img2.ptr;\n\ + __global vxc_ushort8* output_ptr = (__global vxc_ushort8*)img3.ptr;\n\ +\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx);\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW;\n\ + int loc = idx * output_width + gidx;\n\ + output_ptr[loc] = update_ptr[gidy * update_width + gidx];\n\ +}\n\ "; /* end of scatter_nd_update_special_vx*/ static const char select_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -63813,8 +65920,8 @@ static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ }\n\ }\n\ \n\ -#define CUMSUM_toU8_AXIS2_SH(name, src_type, read_image_type) \\\n\ -__kernel void cumsum_##name##toU8_axis2( \\\n\ +#define CUMSUM_toINT_AXIS2_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \\\n\ +__kernel void cumsum_##name##_axis2( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, \\\n\ @@ -63833,19 +65940,19 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ int4 coord_out = coord; \\\n\ \\\n\ src_type sum = (src_type)(0); \\\n\ - uint4 dst = (uint4)(0); \\\n\ + dst_type dst = (dst_type)(0); \\\n\ int tmp_zp = convert_int_rte(output_zp); \\\n\ - dst.x = convert_uint_sat(tmp_zp); \\\n\ + dst.x = convert_dtype(tmp_zp); \\\n\ \\\n\ float cnt = 0.0f; \\\n\ \\\n\ if(exclusive && rev) \\\n\ { \\\n\ coord_out.z = channel - 1; \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + image_write(output, coord_out, dst); \\\n\ for(coord.z = channel - 1; coord.z > 0; coord.z--) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ coord_out.z--; \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ @@ -63853,17 +65960,17 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = 
(uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord_out, dst); \\\n\ } \\\n\ } \\\n\ else if(exclusive) \\\n\ { \\\n\ coord_out.z = 0; \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + image_write(output, coord_out, dst); \\\n\ for(coord.z = 0; coord.z < channel - 1; coord.z++) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ coord_out.z++; \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ @@ -63871,45 +65978,44 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord_out, dst); \\\n\ } \\\n\ } \\\n\ else if(rev) \\\n\ { \\\n\ for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord, dst); \\\n\ } \\\n\ } \\\n\ else \\\n\ { \\\n\ for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord, dst); \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_toU8_AXIS2_SH(U8,uint4,read_imageui)\n\ -CUMSUM_toU8_AXIS2_SH(F32,float4,read_imagef)\n\ -\n\ -\n\ +CUMSUM_toINT_AXIS2_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_toINT_AXIS2_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_toINT_AXIS2_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)\n\ \n\ __kernel void cumsum_F32toF32_axis1(\n\ __read_only image2d_array_t input,\n\ @@ -63979,10 +66085,10 @@ __kernel void cumsum_F32toF32_axis1(\n\ }\n\ }\n\ \n\ -#define CUMSUM_toU8_AXIS1_SH(name, src_type, read_image_type) \\\n\ -__kernel void cumsum_##name##toU8_axis1( \\\n\ - __read_only image2d_array_t input, \\\n\ - __write_only image2d_array_t output, \\\n\ +#define CUMSUM_toINT_AXIS1_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \\\n\ +__kernel void cumsum_##name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ int axis, \\\n\ int exclusive, \\\n\ int rev, \\\n\ @@ -63999,20 +66105,20 @@ __kernel void cumsum_##name##toU8_axis1( \\\n\ int4 coord_out = coord; \\\n\ \\\n\ src_type sum = (src_type)(0); \\\n\ - uint4 dst = (uint4)(0); \\\n\ + dst_type dst = (dst_type)(0); \\\n\ int tmp_zp = convert_int_rte(output_zp); \\\n\ - dst.x = convert_uint_sat(tmp_zp); \\\n\ + dst.x = convert_dtype(tmp_zp); \\\n\ \\\n\ float cnt = 0; \\\n\ \\\n\ if(exclusive && rev) \\\n\ { \\\n\ coord_out.y = height - 1; \\\n\ - write_imageui(output, 
coord_out, dst); \\\n\ + image_write(output, coord_out, dst); \\\n\ \\\n\ for(coord.y = height - 1; coord.y > 0; coord.y--) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ cnt += 1.0f; \\\n\ coord_out.y--; \\\n\ sum += data; \\\n\ @@ -64020,17 +66126,17 @@ __kernel void cumsum_##name##toU8_axis1( \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord_out, dst); \\\n\ } \\\n\ } \\\n\ else if(exclusive) \\\n\ { \\\n\ coord_out.y = 0; \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + image_write(output, coord_out, dst); \\\n\ for(coord.y = 0; coord.y < height - 1; coord.y++) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ cnt += 1.0f; \\\n\ coord_out.y++; \\\n\ sum += data; \\\n\ @@ -64038,44 +66144,44 @@ __kernel void cumsum_##name##toU8_axis1( \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord_out, dst); \\\n\ } \\\n\ } \\\n\ else if(rev) \\\n\ { \\\n\ for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord, dst); \\\n\ } \\\n\ } \\\n\ else \\\n\ { \\\n\ for(coord.y = 0; coord.y < height; coord.y++) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord, dst); \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_toU8_AXIS1_SH(U8,uint4,read_imageui)\n\ -CUMSUM_toU8_AXIS1_SH(F32,float4,read_imagef)\n\ -\n\ +CUMSUM_toINT_AXIS1_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_toINT_AXIS1_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_toINT_AXIS1_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)\n\ \n\ __kernel void cumsum_F32toF32_axis0(\n\ __read_only image2d_array_t input,\n\ @@ -64145,8 +66251,8 @@ __kernel void cumsum_F32toF32_axis0(\n\ }\n\ }\n\ \n\ -#define CUMSUM_toU8_AXIS0_SH(name, src_type, read_image_type) \\\n\ -__kernel void cumsum_##name##toU8_axis0( \\\n\ +#define CUMSUM_toINT_AXIS0_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \\\n\ +__kernel void cumsum_##name##_axis0( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, \\\n\ @@ -64165,19 +66271,19 @@ __kernel void cumsum_##name##toU8_axis0( \\\n\ int4 coord_out = coord; \\\n\ 
\\\n\ src_type sum = (src_type)(0); \\\n\ - uint4 dst = (uint4)(0); \\\n\ + dst_type dst = (dst_type)(0); \\\n\ int tmp_zp = convert_int_rte(output_zp); \\\n\ - dst.x = convert_uint_sat(tmp_zp); \\\n\ + dst.x = convert_dtype(tmp_zp); \\\n\ \\\n\ float cnt = 0; \\\n\ \\\n\ if(exclusive && rev) \\\n\ { \\\n\ coord_out.x = width - 1; \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + image_write(output, coord_out, dst); \\\n\ for(coord.x = width - 1; coord.x > 0; coord.x--) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ coord_out.x--; \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ @@ -64185,8 +66291,8 @@ __kernel void cumsum_##name##toU8_axis0( \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord_out, dst); \\\n\ } \\\n\ } \\\n\ else if(exclusive) \\\n\ @@ -64195,7 +66301,7 @@ __kernel void cumsum_##name##toU8_axis0( \\\n\ write_imageui(output, coord_out, dst); \\\n\ for(coord.x = 0; coord.x < width - 1; coord.x++) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ coord_out.x++; \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ @@ -64203,43 +66309,45 @@ __kernel void cumsum_##name##toU8_axis0( \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord_out, dst); \\\n\ } \\\n\ } \\\n\ else if(rev) \\\n\ { \\\n\ for(coord.x = width - 1; coord.x >= 0; coord.x--) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord, dst); \\\n\ } \\\n\ } \\\n\ else \\\n\ { \\\n\ for(coord.x = 0; coord.x < width; coord.x++) \\\n\ { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ + src_type data = image_read(input, coord); \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ \\\n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord, dst); \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui)\n\ -CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef)\n\ +CUMSUM_toINT_AXIS0_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_toINT_AXIS0_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_toINT_AXIS0_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)\n\ +\n\ "; /* end of cumsum_cl*/ static const char cumsum_2d_cl[] = "\n\ @@ -64309,188 +66417,100 @@ __kernel void cumsum_F32toF32_axis1_2D(\n\ }\n\ }\n\ \n\ -__kernel void cumsum_U8toU8_axis1_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - int axis,\n\ - int 
exclusive,\n\ - int rev,\n\ - int width,\n\ - int height,\n\ - int chn,\n\ - int input_zp,\n\ - float in_out_scale,\n\ - float in_out_zp_scale,\n\ - float output_zp\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - uint4 sum = (uint4)(0);\n\ - uint4 dst = (uint4)(0);\n\ -\n\ - int tmp_zp = convert_int_rte(output_zp);\n\ - dst.x = convert_uint_sat(tmp_zp);\n\ -\n\ - float cnt = 0;\n\ -\n\ - if(exclusive && rev)\n\ - {\n\ - coord.w = height - 1;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ - {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ - coord.w--;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ - }\n\ - }\n\ - else if(exclusive)\n\ - {\n\ - write_imageui(output, coord.zw, dst);\n\ - for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ - {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ - coord.w++;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ - }\n\ - }\n\ - else if(rev)\n\ - {\n\ - for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ - {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ - }\n\ - }\n\ - else\n\ - {\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ - }\n\ - }\n\ -}\n\ -\n\ -__kernel void cumsum_F32toU8_axis1_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - int axis,\n\ - int exclusive,\n\ - int rev,\n\ - int width,\n\ - int height,\n\ - int chn,\n\ - int input_zp,\n\ - float in_out_scale,\n\ - float in_out_zp_scale,\n\ - float output_zp\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - float4 sum = (float4)(0);\n\ - uint4 dst = (uint4)(0);\n\ - int tmp_zp = convert_int_rte(output_zp);\n\ - dst.x = convert_uint_sat(tmp_zp);\n\ -\n\ - float cnt = 0;\n\ -\n\ - if(exclusive && rev)\n\ - {\n\ - coord.w = height - 1;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ - {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ - coord.w--;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ - }\n\ - }\n\ - else if(exclusive)\n\ - {\n\ - write_imageui(output, coord.zw, dst);\n\ - for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ - {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ - coord.w++;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * 
in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ - }\n\ - }\n\ - else if(rev)\n\ - {\n\ - for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ - {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ - }\n\ - }\n\ - else\n\ - {\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ - }\n\ - }\n\ +#define CUMSUM_INT_AXIS1_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \\\n\ +__kernel void cumsum_##name##_axis1_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + int width, \\\n\ + int height, \\\n\ + int chn, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + dst_type dst = (dst_type)(0); \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst.x = convert_dtype(tmp_zp); \\\n\ + \\\n\ + float cnt = 0; \\\n\ + \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord.w = height - 1; \\\n\ + image_write(output, coord.zw, dst); \\\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--) \\\n\ + { \\\n\ + src_type data = image_read(input, coord.xy); \\\n\ + cnt += 1.0f; \\\n\ + coord.w--; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord.zw, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + image_write(output, coord.zw, dst); \\\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++) \\\n\ + { \\\n\ + src_type data = image_read(input, coord.xy); \\\n\ + cnt += 1.0f; \\\n\ + coord.w++; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord.zw, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ + { \\\n\ + src_type data = image_read(input, coord.xy); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord.xy, dst); \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + src_type data = image_read(input, coord.xy); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ 
+ image_write(output, coord.xy, dst); \\\n\ + } \\\n\ + } \\\n\ }\n\ +CUMSUM_INT_AXIS1_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_INT_AXIS1_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_INT_AXIS1_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)\n\ \n\ __kernel void cumsum_F32toF32_axis0_2D(\n\ __read_only image2d_t input,\n\ @@ -64560,191 +66580,103 @@ __kernel void cumsum_F32toF32_axis0_2D(\n\ }\n\ }\n\ \n\ -__kernel void cumsum_U8toU8_axis0_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - int axis,\n\ - int exclusive,\n\ - int rev,\n\ - int width,\n\ - int height,\n\ - int chn,\n\ - int input_zp,\n\ - float in_out_scale,\n\ - float in_out_zp_scale,\n\ - float output_zp\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - uint4 sum = (uint4)(0);\n\ - uint4 dst = (uint4)(0);\n\ -\n\ - int tmp_zp = convert_int_rte(output_zp);\n\ - dst.x = convert_uint_sat(tmp_zp);\n\ -\n\ - float cnt = 0.0f;\n\ -\n\ - if(exclusive && rev)\n\ - {\n\ - coord.x = width - 1;\n\ - coord.z = coord.x;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(; coord.x > 0; coord.x--)\n\ - {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - coord.z--;\n\ - cnt += 1.0;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ - }\n\ - }\n\ - else if(exclusive)\n\ - {\n\ - coord.z = 0;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ - {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ - coord.z++;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ - }\n\ - }\n\ - else if(rev)\n\ - {\n\ - for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ - {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ - }\n\ - }\n\ - else\n\ - {\n\ - for(coord.x = 0; coord.x < width; coord.x++)\n\ - {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ - }\n\ - }\n\ -}\n\ -\n\ -__kernel void cumsum_F32toU8_axis0_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - int axis,\n\ - int exclusive,\n\ - int rev,\n\ - int width,\n\ - int height,\n\ - int chn,\n\ - int input_zp,\n\ - float in_out_scale,\n\ - float in_out_zp_scale,\n\ - float output_zp\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - float4 sum = (float4)(0);\n\ - uint4 dst = (uint4)(0);\n\ - int tmp_zp = convert_int_rte(output_zp);\n\ - dst.x = convert_uint_sat(tmp_zp);\n\ -\n\ - float cnt = 0.0f;\n\ - if(exclusive && rev)\n\ - {\n\ - coord.x = width - 1;\n\ - coord.z = coord.x;\n\ - 
write_imageui(output, coord.zw, dst);\n\ - for(; coord.x > 0; coord.x--)\n\ - {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - coord.z--;\n\ - cnt += 1.0;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ - }\n\ - }\n\ - else if(exclusive)\n\ - {\n\ - coord.z = 0;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ - {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ - coord.z++;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ - }\n\ - }\n\ - else if(rev)\n\ - {\n\ - for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ - {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ - }\n\ - }\n\ - else\n\ - {\n\ - for(coord.x = 0; coord.x < width; coord.x++)\n\ - {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ - }\n\ - }\n\ +#define CUMSUM_INT_AXIS0_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \\\n\ +__kernel void cumsum_##name##_axis0_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + int width, \\\n\ + int height, \\\n\ + int chn, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + dst_type dst = (dst_type)(0); \\\n\ + \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst.x = convert_dtype(tmp_zp); \\\n\ + \\\n\ + float cnt = 0.0f; \\\n\ + \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord.x = width - 1; \\\n\ + coord.z = coord.x; \\\n\ + image_write(output, coord.zw, dst); \\\n\ + for(; coord.x > 0; coord.x--) \\\n\ + { \\\n\ + src_type data = image_read(input, coord.xy); \\\n\ + coord.z--; \\\n\ + cnt += 1.0; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord.zw, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + coord.z = 0; \\\n\ + image_write(output, coord.zw, dst); \\\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++) \\\n\ + { \\\n\ + src_type data = image_read(input, coord.xy); \\\n\ + cnt += 1.0f; \\\n\ + coord.z++; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord.zw, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.x = width - 1; 
coord.x >= 0; coord.x--) \\\n\ + { \\\n\ + src_type data = image_read(input, coord.xy); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord.xy, dst); \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.x = 0; coord.x < width; coord.x++) \\\n\ + { \\\n\ + src_type data = image_read(input, coord.xy); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = convert_dtype(tmpSum); \\\n\ + image_write(output, coord.xy, dst); \\\n\ + } \\\n\ + } \\\n\ }\n\ +CUMSUM_INT_AXIS0_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_INT_AXIS0_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)\n\ +CUMSUM_INT_AXIS0_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)\n\ "; /* end of cumsum_2d_cl*/ static const char cumsum_array_2d_axis0_cl[] = "\n\ @@ -78995,7 +80927,33 @@ __kernel void one_hot_U8toU8\n\ coord.z ++;\n\ } while (coord.z < depth);\n\ }\n\ -"; /* end of one_hot_cl*/ +\n\ +__kernel void one_hot_I32toBF16\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + int depth,\n\ + uint on_value,\n\ + uint off_value,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + int4 src = read_imagei(input, coord.xy);\n\ +\n\ + int val = convert_int(convert_float(src.x) * inputScale - inputTail);\n\ + do\n\ + {\n\ + uint4 dst;\n\ + dst.x = val == coord.z ? 
on_value : off_value;\n\ +\n\ + write_imageui(output, coord.xzyw, dst.xxxx);\n\ +\n\ + coord.z ++;\n\ + } while (coord.z < depth);\n\ +}"; /* end of one_hot_cl*/ static const char poolwithargmax_cl[] = "\n\ #define POOLWITHARGMAX_PROCESS(data_type, read_fun, write_fun0, write_fun1) \\\n\ @@ -82788,6 +84746,381 @@ __kernel void roi_align_U8_U16toU8\n\ }\n\ }"; /* end of roi_align_cl*/ +static const char rope_0_cl[] = "__kernel void rope_F32_F32toF32_axis0\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + float4 cos, sin;\n\ +\n\ + READ_IMAGEF_2DARRAY(cos, cos_cache, coord);\n\ + READ_IMAGEF_2DARRAY(sin, sin_cache, coord);\n\ + coord.x = coord.x * step;\n\ + float4 src0 = read_imagef(input, coord);\n\ + int4 coord_out = coord;\n\ +\n\ + coord.x += half_head_size;\n\ + float4 src1 = read_imagef(input, coord);\n\ +\n\ + float4 dst0 = src0 * cos - src1 * sin;\n\ + float4 dst1 = src0 * sin + src1 * cos;\n\ +\n\ + write_imagef(output, coord_out, dst0);\n\ + coord_out.x += half_head_size;\n\ + write_imagef(output, coord_out, dst1);\n\ +}\n\ +\n\ +__kernel void rope_F32_F32toF32_axis1\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + float4 cos, sin;\n\ +\n\ + READ_IMAGEF_2DARRAY(cos, cos_cache, coord);\n\ + READ_IMAGEF_2DARRAY(sin, sin_cache, coord);\n\ + coord.y = coord.y * step;\n\ + float4 src0 = read_imagef(input, coord);\n\ + int4 coord_out = coord;\n\ + coord.y += half_head_size;\n\ + float4 src1 = read_imagef(input, coord);\n\ +\n\ + float4 dst0 = src0 * cos - src1 * sin;\n\ + float4 dst1 = src0 * sin + src1 * cos;\n\ +\n\ + write_imagef(output, coord_out, dst0);\n\ + coord_out.y += half_head_size;\n\ + write_imagef(output, coord_out, dst1);\n\ +}\n\ +\n\ +__kernel void rope_F32_F32toF32_axis2\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ +\n\ + float4 cos = read_imagef(cos_cache, coord);\n\ + float4 sin = read_imagef(sin_cache, coord);\n\ + coord.z = coord.z * step;\n\ + float4 src0 = read_imagef(input, coord);\n\ + int4 coord_out = coord;\n\ + coord.z += half_head_size;\n\ + float4 src1 = read_imagef(input, coord);\n\ +\n\ + float4 dst0 = src0 * cos - src1 * sin;\n\ + float4 dst1 = src0 * sin + src1 * cos;\n\ +\n\ + write_imagef(output, coord_out, dst0);\n\ + coord_out.z += half_head_size;\n\ + write_imagef(output, coord_out, dst1);\n\ +}\n\ +\n\ +__kernel void 
rope_I32_I32toI32_axis0\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 _cos, _sin;\n\ + float4 cos, sin;\n\ +\n\ + READ_IMAGEI_2DARRAY(_cos, cos_cache, coord);\n\ + READ_IMAGEI_2DARRAY(_sin, sin_cache, coord);\n\ + coord.x = coord.x * step;\n\ + float4 src0 = convert_float4(read_imagei(input, coord));\n\ + int4 coord_out = coord;\n\ +\n\ + coord.x += half_head_size;\n\ + float4 src1 = convert_float4(read_imagei(input, coord));\n\ +\n\ + src0 = src0 - input_zp;\n\ + src1 = src1 - input_zp;\n\ + cos = convert_float4(_cos) - cos_zp;\n\ + sin = convert_float4(_sin) - sin_zp;\n\ +\n\ + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;\n\ + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;\n\ + int4 dst0 = convert_int4_rte(_dst0);\n\ + int4 dst1 = convert_int4_rte(_dst1);\n\ +\n\ + write_imagei(output, coord_out, dst0);\n\ + coord_out.x += half_head_size;\n\ + write_imagei(output, coord_out, dst1);\n\ +}\n\ +\n\ +__kernel void rope_I32_I32toI32_axis1\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 _cos, _sin;\n\ + float4 cos, sin;\n\ +\n\ + READ_IMAGEI_2DARRAY(_cos, cos_cache, coord);\n\ + READ_IMAGEI_2DARRAY(_sin, sin_cache, coord);\n\ + coord.y = coord.y * step;\n\ + float4 src0 = convert_float4(read_imagei(input, coord));\n\ + int4 coord_out = coord;\n\ +\n\ + coord.y += half_head_size;\n\ + float4 src1 = convert_float4(read_imagei(input, coord));\n\ +\n\ + src0 = src0 - input_zp;\n\ + src1 = src1 - input_zp;\n\ + cos = convert_float4(_cos) - cos_zp;\n\ + sin = convert_float4(_sin) - sin_zp;\n\ +\n\ + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;\n\ + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;\n\ + int4 dst0 = convert_int4_rte(_dst0);\n\ + int4 dst1 = convert_int4_rte(_dst1);\n\ +\n\ + write_imagei(output, coord_out, dst0);\n\ + coord_out.y += half_head_size;\n\ + write_imagei(output, coord_out, dst1);\n\ +}\n\ +\n\ +__kernel void rope_I32_I32toI32_axis2\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ +\n\ + float4 cos = convert_float4(read_imagei(cos_cache, coord));\n\ + float4 sin = convert_float4(read_imagei(sin_cache, coord));\n\ + coord.z = coord.z * step;\n\ + float4 src0 = convert_float4(read_imagei(input, coord));\n\ + int4 coord_out = coord;\n\ +\n\ + 
coord.z += half_head_size;\n\ + float4 src1 = convert_float4(read_imagei(input, coord));\n\ +\n\ + src0 = src0 - input_zp;\n\ + src1 = src1 - input_zp;\n\ + cos = cos - cos_zp;\n\ + sin = sin - sin_zp;\n\ +\n\ + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;\n\ + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;\n\ + int4 dst0 = convert_int4_rte(_dst0);\n\ + int4 dst1 = convert_int4_rte(_dst1);\n\ +\n\ + write_imagei(output, coord_out, dst0);\n\ + coord_out.z += half_head_size;\n\ + write_imagei(output, coord_out, dst1);\n\ +}\n\ +\n\ +__kernel void rope_U32_U32toU32_axis0\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + uint4 _cos, _sin;\n\ + float4 cos, sin;\n\ +\n\ + READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord);\n\ + READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord);\n\ + coord.x = coord.x * step;\n\ + float4 src0 = convert_float4(read_imageui(input, coord));\n\ + int4 coord_out = coord;\n\ +\n\ + coord.x += half_head_size;\n\ + float4 src1 = convert_float4(read_imageui(input, coord));\n\ +\n\ + src0 = src0 - input_zp;\n\ + src1 = src1 - input_zp;\n\ + cos = convert_float4(_cos) - cos_zp;\n\ + sin = convert_float4(_sin) - sin_zp;\n\ +\n\ + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;\n\ + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;\n\ + uint4 dst0 = convert_uint4_rte(_dst0);\n\ + uint4 dst1 = convert_uint4_rte(_dst1);\n\ +\n\ + write_imageui(output, coord_out, dst0);\n\ + coord_out.x += half_head_size;\n\ + write_imageui(output, coord_out, dst1);\n\ +}\n\ +\n\ +__kernel void rope_U32_U32toU32_axis1\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + uint4 _cos, _sin;\n\ + float4 cos, sin;\n\ +\n\ + READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord);\n\ + READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord);\n\ + coord.y = coord.y * step;\n\ + float4 src0 = convert_float4(read_imageui(input, coord));\n\ + int4 coord_out = coord;\n\ +\n\ + coord.y += half_head_size;\n\ + float4 src1 = convert_float4(read_imageui(input, coord));\n\ +\n\ + src0 = src0 - input_zp;\n\ + src1 = src1 - input_zp;\n\ + cos = convert_float4(_cos) - cos_zp;\n\ + sin = convert_float4(_sin) - sin_zp;\n\ +\n\ + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;\n\ + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;\n\ + uint4 dst0 = convert_uint4_rte(_dst0);\n\ + uint4 dst1 = convert_uint4_rte(_dst1);\n\ +\n\ + write_imageui(output, coord_out, dst0);\n\ + coord_out.y += half_head_size;\n\ + write_imageui(output, coord_out, dst1);\n\ +}\n\ +\n\ +__kernel void rope_U32_U32toU32_axis2\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t cos_cache,\n\ + __read_only 
image2d_array_t sin_cache,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + float input_zp,\n\ + float cos_zp,\n\ + float sin_zp,\n\ + float scale0,\n\ + float scale1,\n\ + float output_zp,\n\ + int half_head_size,\n\ + int step\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ +\n\ + float4 cos = convert_float4(read_imageui(cos_cache, coord));\n\ + float4 sin = convert_float4(read_imageui(sin_cache, coord));\n\ + coord.z = coord.z * step;\n\ + float4 src0 = convert_float4(read_imageui(input, coord));\n\ + int4 coord_out = coord;\n\ +\n\ + coord.z += half_head_size;\n\ + float4 src1 = convert_float4(read_imageui(input, coord));\n\ +\n\ + src0 = src0 - input_zp;\n\ + src1 = src1 - input_zp;\n\ + cos = cos - cos_zp;\n\ + sin = sin - sin_zp;\n\ +\n\ + float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;\n\ + float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;\n\ + uint4 dst0 = convert_uint4_rte(_dst0);\n\ + uint4 dst1 = convert_uint4_rte(_dst1);\n\ +\n\ + write_imageui(output, coord_out, dst0);\n\ + coord_out.z += half_head_size;\n\ + write_imageui(output, coord_out, dst1);\n\ +}\n\ +"; /* end of rope_0_cl*/ + static const char scatter_elements_cl[] = "\n\ #define SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(name, dtype) \\\n\ __kernel void scatter_elements_axis0_##name \\\n\ @@ -86589,6 +88922,7 @@ static const source_map_t evis_resource[] = {"cumsum_ex_rev_axis1_vx", cumsum_ex_rev_axis1_vx}, {"cumsum_ex_rev_axis2_vx", cumsum_ex_rev_axis2_vx}, {"cumsum_f16_u8_vx", cumsum_f16_u8_vx}, + {"custom_letterbox_vx", custom_letterbox_vx}, {"custom_softmax_vx", custom_softmax_vx}, {"custom_warp_affine_vx", custom_warp_affine_vx}, {"custom_warp_affine_2d_vx", custom_warp_affine_2d_vx}, @@ -86812,12 +89146,19 @@ static const source_map_t evis_resource[] = {"resize_bilinear_U8_vx", resize_bilinear_U8_vx}, {"resize_bilinear_U8_half_pixel_centers_1_vx", resize_bilinear_U8_half_pixel_centers_1_vx}, {"resize_bilinear_U8_half_pixel_centers_2_vx", resize_bilinear_U8_half_pixel_centers_2_vx}, + {"resize_bilinear_U8_half_pixel_centers_3_vx", resize_bilinear_U8_half_pixel_centers_3_vx}, + {"resize_bilinear_U8_half_pixel_centers_4_vx", resize_bilinear_U8_half_pixel_centers_4_vx}, + {"resize_bilinear_U8_half_pixel_centers_5_vx", resize_bilinear_U8_half_pixel_centers_5_vx}, {"resize_bilinear_U8_opt_vx", resize_bilinear_U8_opt_vx}, {"resize_bilinear_align_corners_vx", resize_bilinear_align_corners_vx}, {"resize_bilinear_nhwc_vx", resize_bilinear_nhwc_vx}, {"resize_bilinear_nhwc_bound_vx", resize_bilinear_nhwc_bound_vx}, {"resize_cubic_vx", resize_cubic_vx}, {"resize_nearest_vx", resize_nearest_vx}, + {"rope_0_vx", rope_0_vx}, + {"rope_1_vx", rope_1_vx}, + {"rope_2_vx", rope_2_vx}, + {"rope_3_vx", rope_3_vx}, {"scatter_nd_vx", scatter_nd_vx}, {"scatter_nd_big_vx", scatter_nd_big_vx}, {"scatter_nd_update_vx", scatter_nd_update_vx}, @@ -86987,6 +89328,7 @@ static const source_map_t cl_resource[] = {"resize_nearest_cl", resize_nearest_cl}, {"reversesequence_cl", reversesequence_cl}, {"roi_align_cl", roi_align_cl}, + {"rope_0_cl", rope_0_cl}, {"scatter_elements_cl", scatter_elements_cl}, {"scatter_elements_add_cl", scatter_elements_add_cl}, {"scatter_elements_mul_cl", scatter_elements_mul_cl}, diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c index 2c63c1e..5be282c 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c +++ 
b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c @@ -29,6 +29,7 @@ #include "VX/vx_ext_program.h" #include "vsi_nn_platform.h" #include "vsi_nn_prv.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_log.h" #include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" @@ -198,10 +199,11 @@ static vsi_status vsi_nn_RegisterVXKernel vx_size * program_len = NULL; const char **program_src = NULL; vx_context ctx = NULL; - vsi_nn_context_t context = NULL; vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index]; uint8_t i = 0; vsi_bool load_from_file = FALSE; + vsi_nn_runtime_option_t* options; + options = ((vsi_nn_graph_prv_t*)graph)->options; #define MAX_BUILDPROGRAM_LEN 128 char cmd[MAX_BUILDPROGRAM_LEN] = {0}; @@ -210,8 +212,7 @@ static vsi_status vsi_nn_RegisterVXKernel memset(cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN); status = VSI_FAILURE; ctx = vxGetContext( (vx_reference)graph->g ); - context = graph->ctx; - evis = context->config.evis.ver; + evis = options->config.evis.ver; program_src = (const char**)malloc(kernel_info->resource_num * sizeof(char *)); CHECK_PTR_FAIL_GOTO( program_src, "Create buffer fail.", final ); @@ -244,12 +245,12 @@ static vsi_status vsi_nn_RegisterVXKernel { // set default evis version is 2 snprintf(cmd, MAX_BUILDPROGRAM_LEN, - "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va); + "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", options->config.use_40bits_va); } else { snprintf(cmd, MAX_BUILDPROGRAM_LEN, - "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va); + "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, options->config.use_40bits_va); } status = vxBuildProgram(program, cmd); @@ -302,7 +303,7 @@ static vsi_status vsi_nn_RegisterBinKernel vx_size program_len = 0; const uint8_t *program_ptr = NULL; vx_context ctx; - vsi_nn_context_t context; + vsi_nn_runtime_option_t* options; vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index]; #define MAX_BUILDPROGRAM_LEN 128 @@ -313,8 +314,8 @@ static vsi_status vsi_nn_RegisterBinKernel status = VSI_FAILURE; ctx = vxGetContext( (vx_reference)graph->g ); - context = graph->ctx; - evis = context->config.evis.ver; + options = ((vsi_nn_graph_prv_t*)graph)->options; + evis = options->config.evis.ver; program_ptr = vsi_nn_VxBinResourceGetResource( kernel_info->resource_name[kernel_info->resource_num - 1], &program_len); @@ -337,12 +338,12 @@ static vsi_status vsi_nn_RegisterBinKernel { // set default evis version is 2 snprintf(cmd, MAX_BUILDPROGRAM_LEN, - "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va); + "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", options->config.use_40bits_va); } else { snprintf(cmd, MAX_BUILDPROGRAM_LEN, - "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va); + "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, options->config.use_40bits_va); } #else snprintf(cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension"); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c index d1ca746..be08a5e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c @@ -35,6 +35,8 @@ #include "utils/vsi_nn_constraint_check.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" +#include 
"vsi_nn_tensor_util_prv.h" +#include "vsi_nn_error.h" static vsi_status _try_set_high_presision_tensor ( @@ -120,9 +122,22 @@ static vsi_status _static_batchnorm vsi_nn_tensor_t ** outputs ) { +#define _TENSOR_LEN 64 vsi_status status; vsi_nn_kernel_param_t * param = NULL; vsi_nn_tensor_t* reshape_tensors[6] = { NULL }; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM]; + uint32_t new_rank = 4; + vsi_nn_tensor_t* input0 = NULL; + vsi_nn_tensor_t* output = NULL; + char reshape0_tensor_name[_TENSOR_LEN]; + char reshape1_tensor_name[_TENSOR_LEN]; + char batch_norm_tensor_name[_TENSOR_LEN]; + + memset(reshape0_tensor_name, 0, sizeof(reshape0_tensor_name)); + memset(reshape1_tensor_name, 0, sizeof(reshape1_tensor_name)); + memset(batch_norm_tensor_name, 0, sizeof(batch_norm_tensor_name)); + status = VSI_FAILURE; status = _try_set_high_presision_tensor(inputs); @@ -131,10 +146,43 @@ static vsi_status _static_batchnorm VSILOGE("Set tensor attr of high presision fail"); return status; } - if(_require_reshape(self, inputs)) + if (_require_reshape(self, inputs)) { - reshape_tensors[0] = self->nn_param.batch_norm.local->reshaped_input; - reshape_tensors[5] = self->nn_param.batch_norm.local->reshaped_output; + if (3 == inputs[0]->attr.dim_num) + { + shape[0] = inputs[0]->attr.size[0]; + shape[1] = 1; + shape[2] = inputs[0]->attr.size[1]; + shape[3] = inputs[0]->attr.size[2]; + } + else if (5 == inputs[0]->attr.dim_num) + { + shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1]; + shape[1] = inputs[0]->attr.size[2]; + shape[2] = inputs[0]->attr.size[3]; + shape[3] = inputs[0]->attr.size[4]; + } + + input0 = vsi_nn_kernel_insert_reshape_node(self->graph, + inputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_BACKWARD); + CHECK_PTR_FAIL_GOTO(input0, "Create tensor fail.", final); + reshape_tensors[0] = input0; + snprintf(reshape0_tensor_name, sizeof(reshape0_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0); + if (vxSetReferenceName((vx_reference)reshape_tensors[0]->t, reshape0_tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u reshape 0 node output name fail", self->uid); + goto final; + } + output = vsi_nn_kernel_insert_reshape_node(self->graph, + outputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_FORWARD); + CHECK_PTR_FAIL_GOTO(output, "Create tensor fail.", final); + reshape_tensors[5] = output; + snprintf(reshape1_tensor_name, sizeof(reshape1_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1); + if (vxSetReferenceName((vx_reference)outputs[0]->t, reshape1_tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u reshap 1 node output name fail", self->uid); + goto final; + } } else { @@ -155,12 +203,26 @@ static vsi_status _static_batchnorm reshape_tensors, 5, &reshape_tensors[5], 1, param ); - if( self->n ) + if ( self->n ) { status = VSI_SUCCESS; } - vsi_nn_kernel_param_release( ¶m ); + vsi_nn_kernel_param_release(¶m); + + if (output) + { + snprintf(batch_norm_tensor_name, sizeof(batch_norm_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 2); + if (vxSetReferenceName((vx_reference)output->t, batch_norm_tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u instance_norm node output name fail", self->uid); + goto final; + } + } + +final: + vsi_safe_release_tensor(input0); + vsi_safe_release_tensor(output); return status; } @@ -313,68 +375,6 @@ static vsi_status op_compute return status; } /* op_compute() */ -static vsi_status op_optimize - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_opt_direction_e direction - ) -{ - uint32_t dim = 0; 
- vsi_nn_batcnnorm_lcl_data *local = NULL; - vsi_size_t shape[VSI_NN_MAX_DIM_NUM]; - char tensor_name[128]; - - dim = inputs[0]->attr.dim_num; - if(_require_reshape(self, inputs) == FALSE) - { - return VSI_SUCCESS; - } - - VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); - /* - reshape 3d input (xcn) --> 4d input (whcn) - reshape 3d output(xcn) --> 4d output(whcn) - */ - dim = 4; - if (3 == inputs[0]->attr.dim_num) - { - shape[0] = inputs[0]->attr.size[0]; - shape[1] = 1; - shape[2] = inputs[0]->attr.size[1]; - shape[3] = inputs[0]->attr.size[2]; - } - else if (5 == inputs[0]->attr.dim_num) - { - shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1]; - shape[1] = inputs[0]->attr.size[2]; - shape[2] = inputs[0]->attr.size[3]; - shape[3] = inputs[0]->attr.size[4]; - } - local = self->nn_param.batch_norm.local; - if (VSI_NN_OPTIMIZE_BACKWARD == direction) - { - local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim); - } - else - { - local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim); - if(local->reshaped_output && local->reshaped_output->t) - { - memset(tensor_name, 0, sizeof(tensor_name)); - snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid); - if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE) - { - VSILOGW("Set uid %u batchnorm reshaped output name fail", self->uid); - return VSI_FAILURE; - } - } - } - - return VSI_SUCCESS; -} /* op_optimize() */ - static vsi_bool _dynamic_check ( vsi_nn_node_t * self, @@ -494,58 +494,6 @@ static vsi_bool op_check } } /* op_check() */ -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_nn_batcnnorm_lcl_data *local = NULL; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) - { - outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - memcpy( outputs[0]->attr.size, inputs[0]->attr.size, - VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); - } - - if(_require_reshape(self, inputs)) - { - local = (vsi_nn_batcnnorm_lcl_data *)malloc(sizeof(vsi_nn_batcnnorm_lcl_data)); - if(NULL == local) - { - return VSI_FAILURE; - } - memset(local, 0, sizeof(vsi_nn_batcnnorm_lcl_data)); - self->nn_param.batch_norm.local = local; - } - return TRUE; -} /* op_setup() */ - -static vsi_status op_deinit - ( - vsi_nn_node_t * self - ) -{ - vsi_nn_batch_norm_param *p = &(self->nn_param.batch_norm); - if(p->local) - { - if (p->local->reshaped_input) - { - vsi_nn_ReleaseTensor(&(p->local->reshaped_input)); - p->local->reshaped_input = NULL; - } - if (p->local->reshaped_output) - { - vsi_nn_ReleaseTensor(&(p->local->reshaped_output)); - p->local->reshaped_output = NULL; - } - vsi_nn_safe_free(p->local); - } - vsi_nn_op_common_deinit(self); - return VSI_SUCCESS; -} - #ifdef __cplusplus extern "C" { #endif @@ -555,10 +503,10 @@ DEF_OP_REG /* op_name */ BATCH_NORM, /* init */ NULL, /* compute */ op_compute, - /* deinit */ op_deinit, + /* deinit */ vsi_nn_op_common_deinit, /* check */ op_check, - /* setup */ op_setup, - /* optimize */ op_optimize, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, /* input_num */ 5, /* output_num */ 1 ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bitcast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bitcast.c index c47bd27..71513e1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bitcast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bitcast.c @@ -118,6 +118,7 @@ static vsi_bool op_setup if (outputs[0]->attr.dim_num == 0) { 
outputs[0]->attr.size[0] = 1; + outputs[0]->attr.dim_num = 1; vsi_nn_SetTensorIsScalar(outputs[0], TRUE); } else diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c index 43f8a8f..117a578 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c @@ -82,6 +82,7 @@ static vsi_bool op_check { BEGIN_IO_TYPE_DECL(CUMSUM, 1, 1) IO_TYPE(D_U32, D_U32) + IO_TYPE(D_I32, D_I32) IO_TYPE(D_F32, D_F32) IO_TYPE(D_F16, D_F16) IO_TYPE(D_BF16, D_BF16) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index 11f0268..d9dbd88 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -253,6 +253,7 @@ static vsi_bool op_check IO_TYPE(D_BOOL8, D_I32) IO_TYPE(D_BOOL8, D_U16) IO_TYPE(D_BOOL8, D_U32) + IO_TYPE(D_BOOL8, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_BOOL8) IO_TYPE(D_I8|Q_ASYM, D_BOOL8) IO_TYPE(D_I8|Q_DFP, D_BOOL8) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c index a768b46..b6eb002 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c @@ -155,10 +155,10 @@ vsi_bool vsi_nn_op_eltwise_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; - out_rank = inputs[0]->attr.dim_num; + out_rank = vsi_nn_get_tensor_dims(inputs[0]); for ( i = 1; i < self->input.num; i++) { - in2_rank = inputs[i]->attr.dim_num; + in2_rank = vsi_nn_get_tensor_dims(inputs[i]); out_rank = vsi_nn_max( out_rank, in2_rank ); } @@ -166,10 +166,10 @@ vsi_bool vsi_nn_op_eltwise_setup { vsi_size_t sz0, sz1; - sz0 = i < inputs[0]->attr.dim_num ? inputs[0]->attr.size[i] : 1; + sz0 = i < vsi_nn_get_tensor_dims(inputs[0]) ? inputs[0]->attr.size[i] : 1; for ( j = 1; j < self->input.num; j++) { - sz1 = i < inputs[j]->attr.dim_num ? inputs[j]->attr.size[i] : 1; + sz1 = i < vsi_nn_get_tensor_dims(inputs[j]) ? 
inputs[j]->attr.size[i] : 1; sz0 = vsi_nn_max( sz0, sz1 ); if (sz0 != sz1 && sz0 != 1 && sz1 != 1) { @@ -187,11 +187,12 @@ vsi_bool vsi_nn_op_eltwise_setup { outputs[0]->attr.dim_num = out_rank; memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); - if (out_rank == 1 && - vsi_nn_GetTensorIsScalar(inputs[0]) && + if (vsi_nn_GetTensorIsScalar(inputs[0]) && vsi_nn_GetTensorIsScalar(inputs[1])) { vsi_nn_SetTensorIsScalar(outputs[0], TRUE); + outputs[0]->attr.size[0] = 1; + outputs[0]->attr.dim_num = 1; } } else diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index a887591..3a8edea 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -199,6 +199,7 @@ static vsi_bool op_setup if (o_rank == 0) { outputs[0]->attr.size[0] = 1; + outputs[0]->attr.dim_num = 1; vsi_nn_SetTensorIsScalar(outputs[0], TRUE); } else diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c index 31f7abc..878384a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c @@ -306,6 +306,8 @@ static vsi_bool _op_check IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM) IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_F32, D_F32, D_U16|Q_ASYM) + IO_TYPE(D_U16|Q_SYM, D_F32, D_F32, D_U16|Q_SYM) END_IO_TYPE_DECL(GROUP_NORM) if (!VALIDATE_OP_IO_TYPES(GROUP_NORM, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c index da15699..bfa87f3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c @@ -25,6 +25,7 @@ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_log.h" #include "vsi_nn_graph.h" @@ -197,6 +198,7 @@ static vsi_bool op_setup_default vsi_nn_internal_tensor_t * hstate_fc_outputs[GRUCELL_GATE_CNT] = { NULL }; vsi_nn_internal_tensor_t * h_times_r = NULL; vsi_nn_tensor_attr_t attr; + vsi_nn_activation_e recurrent_activation = p->recurrent_activation; vsi_nn_internal_init_node_wksp( self ); @@ -230,7 +232,8 @@ static vsi_bool op_setup_default memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; if (inputs[GRUCELL_IN_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 || - self->graph->ctx->config.support_stream_processor) + (((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor && + recurrent_activation == VSI_NN_ACT_SIGMOID)) { attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l1_layer_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l1_layer_norm.c index 5dbe4a4..e2ad82c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l1_layer_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l1_layer_norm.c @@ -93,37 +93,15 @@ static vsi_bool op_check { BEGIN_IO_TYPE_DECL(L1_LAYER_NORM, 4, 1) IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_F16) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F16) - IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_U8|Q_ASYM) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_DFP) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F32, D_F16, D_F32, 
D_I8|Q_ASYM) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_SYM) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_SYM) - IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_DFP) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_ASYM) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_SYM) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_SYM) IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_I16|Q_SYM) - IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_F16) - IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_F16) - IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_F16) - IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F16) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_I16|Q_DFP) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c index d52eb7d..7382b1b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c @@ -25,6 +25,7 @@ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_graph.h" #include "vsi_nn_node.h" @@ -351,7 +352,7 @@ static vsi_bool op_setup } else if ( ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 && outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) || - self->graph->ctx->config.support_stream_processor ) + ((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor ) { vsi_nn_internal_tensor_t* output_tensor = NULL; vsi_nn_internal_tensor_t* reshape_tensor = NULL; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index 46a389a..5c4502d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -106,7 +106,7 @@ static vsi_bool op_setup vsi_nn_internal_init_node_wksp( self ); - if ( axis != 0 && !self->graph->ctx->config.support_stream_processor) + if ( axis != 0 && !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor) { vsi_nn_internal_tensor_t* mean_tensor = NULL; vsi_nn_internal_tensor_t* vari_tensor = NULL; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c index 22dfd66..af2d283 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c @@ -25,6 +25,7 @@ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_log.h" #include "vsi_nn_graph.h" @@ -139,7 +140,7 @@ static vsi_bool op_setup p->is_cifg = inputs[LSTMUNIT_ACT_INPUT_FC_I] == NULL; p->is_projection = outputs[LSTMUNIT_ACT_HSTATE_OUT] == NULL; - if 
(self->graph->ctx->config.support_stream_processor) + if (((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor) { p->is_layer_norm = inputs[LSTMUNIT_ACT_HSTATE_FC_F] == NULL; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c b/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c index 2e0e48b..a7ec872 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c @@ -100,6 +100,7 @@ static vsi_bool op_check IO_TYPE(D_I32, D_I16|Q_ASYM) IO_TYPE(D_I32, D_I16|Q_SYM) IO_TYPE(D_I32, D_I32) + IO_TYPE(D_I32, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) @@ -111,8 +112,10 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_BF16) IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_BF16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_BF16) IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) @@ -124,11 +127,14 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_BF16) IO_TYPE(D_I16|Q_ASYM, D_F32) IO_TYPE(D_I16|Q_SYM, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_BF16) IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_BF16) IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(ONE_HOT) if (!VALIDATE_OP_IO_TYPES(ONE_HOT, self, inputs, self->input.num, outputs, self->output.num)) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c index 80acd79..60285f6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c @@ -36,6 +36,7 @@ #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) @@ -50,33 +51,52 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; - param =vsi_nn_kernel_param_create(); + vsi_nn_tensor_t* reshape_tensor = NULL; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_nn_pre_process_rgb_param* p = NULL; - vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_rgb.local.scale_x ); - vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_rgb.local.scale_y ); - vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_rgb.rect.left ); - vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_rgb.rect.top ); - vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_rgb.r_mean ); - vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_rgb.g_mean ); - vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb.b_mean ); - vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_rgb.r_scale ); - vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_rgb.g_scale ); - vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_rgb.b_scale ); - vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_rgb.reverse_channel ); - vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_rgb.local.enable_perm ); - 
vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb.local.enable_copy ); - n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", inputs, 1, outputs, 1, param ); - if( n != NULL ) + memcpy(shape, inputs[0]->attr.size, inputs[0]->attr.dim_num * sizeof(vsi_size_t)); + + shape[0] = shape[1] * shape[0]; + shape[1] = shape[2]; + shape[2] = 1; + + reshape_tensor = vsi_nn_reshape_tensor(self->graph, + inputs[0], shape, inputs[0]->attr.dim_num); + CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor failed", final); + + p = (vsi_nn_pre_process_rgb_param*)&(self->nn_param.pre_process_rgb); + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "scale_x", p->local->scale_x ); + vsi_nn_kernel_param_add_int32( param, "scale_y", p->local->scale_y ); + vsi_nn_kernel_param_add_int32( param, "left", p->rect.left ); + vsi_nn_kernel_param_add_int32( param, "top", p->rect.top ); + vsi_nn_kernel_param_add_float32( param, "r_mean", p->r_mean ); + vsi_nn_kernel_param_add_float32( param, "g_mean", p->g_mean ); + vsi_nn_kernel_param_add_float32( param, "b_mean", p->b_mean ); + vsi_nn_kernel_param_add_float32( param, "r_scale", p->r_scale ); + vsi_nn_kernel_param_add_float32( param, "g_scale", p->g_scale ); + vsi_nn_kernel_param_add_float32( param, "b_scale", p->b_scale ); + vsi_nn_kernel_param_add_int32( param, "reverse", p->reverse_channel ); + vsi_nn_kernel_param_add_int32( param, "enable_perm", p->local->enable_perm ); + vsi_nn_kernel_param_add_int32( param, "enable_copy", p->local->enable_copy ); + n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", &reshape_tensor, 1, outputs, 1, param ); + if ( n != NULL ) { self->n = (vx_node)n; status = VSI_SUCCESS; } - if(param != NULL) + if (param != NULL) { vsi_nn_kernel_param_release( ¶m ); } +final: + vsi_safe_release_tensor(reshape_tensor); + return status; } /* op_compute() */ @@ -166,35 +186,57 @@ static vsi_bool op_setup } - self->nn_param.pre_process_rgb.local.enable_perm = FALSE; + p->local->enable_perm = FALSE; - if (self->nn_param.pre_process_rgb.local.enable_perm == FALSE) + if (p->local->enable_perm == FALSE) { - p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]); - p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]); + p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]); + p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]); } else { - p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]); - p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]); + p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]); + p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]); } - p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15))); + p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15))); return TRUE; } /* op_setup() */ +static vsi_status op_init +( + vsi_nn_node_t* self +) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.pre_process_rgb.local = + (vsi_nn_pre_process_rgb_lcl_data*)malloc(sizeof(vsi_nn_pre_process_rgb_lcl_data)); + + if (NULL == self->nn_param.pre_process_rgb.local) + { + return VX_ERROR_NO_MEMORY; + } + + 
memset(self->nn_param.pre_process_rgb.local, 0, sizeof(vsi_nn_pre_process_rgb_lcl_data)); + + return status; +} /* op_init() */ + static vsi_status op_deinit ( vsi_nn_node_t * self ) { - if (self->nn_param.pre_process_rgb.local.local_tensor != NULL) + if (self->nn_param.pre_process_rgb.local->local_tensor != NULL) { - vxReleaseTensor(&self->nn_param.pre_process_rgb.local.local_tensor); - self->nn_param.pre_process_rgb.local.local_tensor = NULL; + vxReleaseTensor(&self->nn_param.pre_process_rgb.local->local_tensor); + self->nn_param.pre_process_rgb.local->local_tensor = NULL; } + vsi_nn_safe_free(self->nn_param.pre_process_rgb.local); + vsi_nn_op_common_deinit(self); return VSI_SUCCESS; @@ -208,7 +250,7 @@ extern "C" { DEF_OP_REG ( /* op_name */ PRE_PROCESS_RGB, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c index eacf99d..3f80fac 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c @@ -79,7 +79,10 @@ static vsi_status _prelu_op_compute vsi_status status = VSI_FAILURE; vsi_nn_prelu_param *prelu = &self->nn_param.prelu; vsi_ssize_t shapes[VSI_NN_MAX_DIM_NUM] = { 1 }; - vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_nn_tensor_t* input0 = NULL; + vsi_nn_tensor_t* input1 = NULL; + vsi_nn_tensor_t* output = NULL; vsi_bool one_rank = FALSE; vsi_bool is_per_channel_alpha = 0; vsi_size_t alpha_shape = 1; @@ -88,6 +91,7 @@ static vsi_status _prelu_op_compute uint32_t dims = outputs[0]->attr.dim_num; reshape_tensors[0] = inputs[0]; + reshape_tensors[2] = outputs[0]; one_rank = _is_one_rank_tensor(inputs[1], &alpha_shape); for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) @@ -114,18 +118,23 @@ static vsi_status _prelu_op_compute dims = inputs[1]->attr.dim_num; } - reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + input1 = vsi_nn_reshape_tensor( self->graph, inputs[1], (vsi_size_t*)shapes, dims ); + CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final); + reshape_tensors[1] = input1; } else { memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t)); - reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + input1 = vsi_nn_reshape_tensor( self->graph, inputs[1], (vsi_size_t*)shapes, inputs[1]->attr.dim_num ); + CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final); + reshape_tensors[1] = input1; } } else { + uint32_t rank = inputs[0]->attr.dim_num; dims = inputs[1]->attr.dim_num; memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t)); @@ -141,9 +150,32 @@ static vsi_status _prelu_op_compute shapes[1] = 1; dims = 2; } + else if (one_rank && inputs[1]->attr.is_const == TRUE && + alpha_shape == inputs[0]->attr.size[0] && + alpha_shape == inputs[1]->attr.size[0] && + rank < 3) + { + is_per_channel_alpha = TRUE; + shapes[0] = 1; + shapes[1] = 1; + shapes[2] = alpha_shape; + shapes[3] = rank > 1 ? 
inputs[0]->attr.size[1] : 1; + dims = 4; + input0 = vsi_nn_reshape_tensor(self->graph, inputs[0], (vsi_size_t*)shapes, dims); + CHECK_PTR_FAIL_GOTO(input0, "Create tensor fail.", final); + reshape_tensors[0] = input0; + output = vsi_nn_reshape_tensor(self->graph, outputs[0], (vsi_size_t*)shapes, dims); + CHECK_PTR_FAIL_GOTO(output, "Create tensor fail.", final); + reshape_tensors[2] = output; + shapes[0] = alpha_shape; + shapes[1] = 1; + dims = 2; + } - reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + input1 = vsi_nn_reshape_tensor( self->graph, inputs[1], (vsi_size_t*)shapes, dims ); + CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final); + reshape_tensors[1] = input1; } // Add params @@ -153,15 +185,19 @@ static vsi_status _prelu_op_compute self->n = (vx_node)vsi_nn_kernel_selector( self->graph, kernel_name, &reshape_tensors[0], 2, - outputs, 1, param ); + &reshape_tensors[2], 1, param ); vsi_nn_kernel_param_release( ¶m ); - vsi_nn_ReleaseTensor( &reshape_tensors[1] ); - if( self->n ) + if ( self->n ) { status = VSI_SUCCESS; } +final: + vsi_safe_release_tensor(input0); + vsi_safe_release_tensor(input1); + vsi_safe_release_tensor(output); + return status; } /* _prelu_op_compute() */ @@ -211,28 +247,36 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(PRELU, 2, 1) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_BF16, D_F16, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_BF16, D_F16, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) /* HW 9.0 */ - IO_TYPE(D_F32, D_BF16, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32) + IO_TYPE(D_F32, D_BF16, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_F32) END_IO_TYPE_DECL(PRELU) - if(!VALIDATE_OP_IO_TYPES(PRELU, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(PRELU, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index 4c314b8..84dc0b6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -162,7 +162,7 @@ static vsi_bool _check_is_sp_supported_type int32_t * axes = self->nn_param.reduce.local2->axes; int32_t axes_num = self->nn_param.reduce.local2->axes_num; - if ( !self->graph->ctx->config.support_stream_processor || + if ( !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor || (type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN && type != VSI_NN_REDUCE_MAX) ) { return FALSE; @@ -788,7 +788,7 @@ static vsi_bool op_set_reduce_axis( } *out_rank_x = inputs[0]->attr.dim_num; } - else if (!self->graph->ctx->config.support_stream_processor || + else if (!((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor || resolved_dim_count > 2) { optimzation_input_size( diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c index 662fa96..ce249e2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c @@ -61,7 +61,7 @@ static vsi_status op_compute vx_nn_reshape_params_t reshape_param; memset(&attr, 0, sizeof(attr)); - attr.size[0] = self->nn_param.reshape.dim_num; + attr.size[0] = vsi_nn_max(self->nn_param.reshape.dim_num, 1); attr.dim_num = 1; attr.is_const = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_INT32; @@ -124,17 +124,28 @@ static vsi_bool op_setup vsi_bool ret = TRUE; if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t i = 0; - for (i = 0; i < self->nn_param.reshape.dim_num; i++) + if (self->nn_param.reshape.dim_num == 0 || + self->nn_param.reshape.size == NULL + ) { - shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? \ - (vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i]; + outputs[0]->attr.size[0] = 1; + outputs[0]->attr.dim_num = 1; + vsi_nn_SetTensorIsScalar(outputs[0], TRUE); + } + else + { + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint32_t i = 0; + for (i = 0; i < self->nn_param.reshape.dim_num; i++) + { + shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? 
\ + (vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i]; + } + ret = vsi_nn_CalcReshapeTensor(inputs[0], + outputs[0], + shape, + self->nn_param.reshape.dim_num); } - ret = vsi_nn_CalcReshapeTensor(inputs[0], - outputs[0], - shape, - self->nn_param.reshape.dim_num); } return ret; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c index 93d269d..dff517f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c @@ -66,7 +66,7 @@ static vsi_status op_compute } memset(&attr, 0, sizeof(attr)); - attr.size[0] = self->nn_param.reshape2.dim_num; + attr.size[0] = vsi_nn_max(self->nn_param.reshape2.dim_num, 1); attr.dim_num = 1; attr.is_const = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_INT32; @@ -161,13 +161,24 @@ static vsi_bool op_setup vsi_bool ret = TRUE; if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; - memcpy(shape, self->nn_param.reshape2.size, - sizeof(vsi_size_t) * self->nn_param.reshape2.dim_num); - ret = vsi_nn_CalcReshapeTensor(inputs[0], - outputs[0], - shape, - self->nn_param.reshape2.dim_num); + if (self->nn_param.reshape2.dim_num == 0 || + self->nn_param.reshape2.size == NULL + ) + { + outputs[0]->attr.size[0] = 1; + outputs[0]->attr.dim_num = 1; + vsi_nn_SetTensorIsScalar(outputs[0], TRUE); + } + else + { + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + memcpy(shape, self->nn_param.reshape2.size, + sizeof(vsi_size_t) * self->nn_param.reshape2.dim_num); + ret = vsi_nn_CalcReshapeTensor(inputs[0], + outputs[0], + shape, + self->nn_param.reshape2.dim_num); + } } return ret; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rope.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rope.c new file mode 100644 index 0000000..45c0307 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rope.c @@ -0,0 +1,145 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" +#include "utils/vsi_nn_dtype_util.h" +#include "vsi_nn_error.h" + +typedef struct _rope_local_data_t { + int32_t placeholder; +} rope_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param = NULL; + int32_t axis = self->nn_param.rope.axis; + vsi_bool interleaved = self->nn_param.rope.interleaved; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32(param, "axis", axis); + vsi_nn_kernel_param_add_int32(param, "interleaved", interleaved); + self->n = (vx_node)vsi_nn_kernel_selector(self->graph, "rope", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param); + + if ( self->n ) + { + status = VSI_SUCCESS; + } + if (param != NULL) + { + vsi_nn_kernel_param_release(&param); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(ROPE, _INPUT_NUM, _OUTPUT_NUM) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_BF16, D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_U16|Q_ASYM, D_U16|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_F16, D_F16, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_I16|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_F16, D_F16, D_U16|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_U8|Q_ASYM) + END_IO_TYPE_DECL(ROPE) + if (!VALIDATE_OP_IO_TYPES(ROPE, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ ROPE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + 
); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c index 0d85eb1..70b22e7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c @@ -25,6 +25,7 @@ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_graph.h" #include "vsi_nn_node.h" @@ -188,7 +189,7 @@ static vsi_status op_optimize } if ( _need_split_softmax(self, inputs) == FALSE || self->nn_param.softmax_internal.axis != 0 || - self->graph->ctx->config.support_stream_processor ) + ((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor ) { return status; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c index 7e8ae34..652e6c4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c @@ -39,6 +39,10 @@ #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #include "vsi_nn_error.h" +typedef struct _topk_local_data_t { + vsi_bool use_internal_node; +} topk_local_data_t; + #define _INPUT_NUM (1) #define _OUTPUT_NUM (2) @@ -111,19 +115,43 @@ static vsi_status op_compute vsi_nn_tensor_t * out1_tensor = NULL; vsi_bool ret = FALSE; - if (inputs[0]->attr.size[axis] == 1) + if (self->nn_param.topk.local->use_internal_node) { return vsi_nn_internal_compute_node( self ); } - ret = vsi_nn_kernel_optimize_softmax_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, - shapes[0], &rank_in, &new_axis0); + if (inputs[0]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH) + { + int32_t i = 1; - ret = vsi_nn_kernel_optimize_softmax_shape( - outputs[0]->attr.size, outputs[0]->attr.dim_num, axis, - shapes[1], &rank_out, &new_axis1); + shapes[0][0] = inputs[0]->attr.size[0]; + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[0][1] = 1; + shapes[1][1] = 1; + for (i = 1; i < (int32_t)(inputs[0]->attr.dim_num); i++) + { + shapes[0][1] = shapes[0][1] * inputs[0]->attr.size[i]; + } + for (i = 1; i < (int32_t)(outputs[0]->attr.dim_num); i++) + { + shapes[1][1] = shapes[1][1] * outputs[0]->attr.size[i]; + } + new_axis0 = axis; + new_axis1 = axis; + rank_in = 2; + rank_out = 2; + ret = TRUE; + } + else + { + ret = vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rank_in, &new_axis0); + ret = vsi_nn_kernel_optimize_softmax_shape( + outputs[0]->attr.size, outputs[0]->attr.dim_num, axis, + shapes[1], &rank_out, &new_axis1); + } if (ret) { uint32_t perm_in[VSI_NN_MAX_DIM_NUM] = {0}; @@ -303,10 +331,12 @@ static vsi_bool op_setup vsi_nn_internal_tensor_t* const0_input = NULL; vsi_nn_tensor_attr_t attr; + p->local->use_internal_node = TRUE; + vsi_nn_internal_init_node_wksp(self); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); - curr->inputs[0] = inputs[0]; + curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; vsi_nn_internal_setup_node(self, curr); @@ -318,10 +348,42 @@ static vsi_bool op_setup CHECK_PTR_FAIL_GOTO(const0_input, "Create tensor failed", final); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); - curr->inputs[0] = const0_input->t; + curr->inputs[0] = const0_input->t; curr->outputs[0] = outputs[1]; vsi_nn_internal_setup_node(self, curr); } + else if 
(vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE) + { + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* temp_tensor = NULL; + vsi_nn_tensor_attr_t attr; + + p->local->use_internal_node = TRUE; + + vsi_nn_internal_init_node_wksp(self); + + memcpy(&attr, &inputs[0]->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + temp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(temp_tensor, "Create tensor failed", final); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_TOPK, 1, 2); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->node->nn_param.topk.axis = p->axis; + curr->node->nn_param.topk.k = p->k; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = temp_tensor->t; + curr->outputs[1] = outputs[1]; + vsi_nn_internal_setup_node(self, curr); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = temp_tensor->t; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + } return TRUE; final: @@ -341,7 +403,7 @@ static vsi_status op_optimize VSI_UNREFERENCED(outputs); p = &(self->nn_param.topk); - if (inputs[0]->attr.size[p->axis] == 1) + if (p->local->use_internal_node) { return vsi_nn_internal_optimize_node( self, direction ); } @@ -357,6 +419,14 @@ static vsi_status op_init vsi_status status = VSI_SUCCESS; self->nn_param.topk.axis = 0; + self->nn_param.topk.local = \ + (topk_local_data_t*)malloc(sizeof(topk_local_data_t)); + if (NULL == self->nn_param.topk.local) + { + return VX_ERROR_NO_MEMORY; + } + memset(self->nn_param.topk.local, 0, sizeof(topk_local_data_t)); + return status; } /* op_init() */ @@ -365,7 +435,12 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - vsi_nn_internal_deinit_node_wksp(self); + if (self->nn_param.topk.local->use_internal_node) + { + vsi_nn_internal_deinit_node_wksp(self); + } + + vsi_nn_safe_free(self->nn_param.topk.local); vsi_nn_op_common_deinit(self); return VSI_SUCCESS; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index feaa0fc..d411a6b 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -475,6 +475,7 @@ static _op_param_gen_t s_op_gen[] = /* GROUPED_CONV3D */ NULL, /* COL2IM */ NULL, /* L1_LAYER_NORM */ NULL, + /* ROPE */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c index ac4aa2a..e59bc81 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -98,7 +98,7 @@ static VSI_INLINE_API void _convert_bfloat16_to_float uint32_t i; for( i = 0; i < size; i ++ ) { - out_buffer[i] = bfp16_to_fp32( (int16_t)buffer[i] ); + out_buffer[i] = bfp16_to_fp32( (uint16_t)buffer[i] ); } } /* _convert_bfloat16_to_float */ diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index 3a40e10..969ca51 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -40,6 +40,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_graph.h" #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "vsi_nn_log.h" 
@@ -1261,7 +1262,9 @@ vsi_bool vsi_nn_is_same_quant_type( break; } #ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT - case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC: { + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC: + { const float diff = (float)1e-5; int32_t i = 0; int32_t scale_cnt0 = src_dtype->group_count; @@ -1627,12 +1630,12 @@ vsi_bool vsi_nn_is_stream_process_supported_types { size_t i = 0; - if ( graph->ctx->config.support_stream_processor == 0 ) + if ( ((vsi_nn_graph_prv_t*)graph)->options->config.support_stream_processor == 0 ) { return FALSE; } - if ( graph->ctx->config.sp_exec_count == 0 ) + if ( ((vsi_nn_graph_prv_t*)graph)->options->config.sp_exec_count == 0 ) { return FALSE; } @@ -1769,3 +1772,11 @@ typedef enum return support; } + +uint32_t vsi_nn_get_tensor_dims + ( + vsi_nn_tensor_t* tensor + ) +{ + return vsi_nn_GetTensorIsScalar(tensor) ? 0 : tensor->attr.dim_num; +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index 4fd9be7..e669a2c 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -39,6 +39,9 @@ static vsi_status query_hardware_caps #endif #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT vx_hardware_caps_params_ext_t paramExt; +#if VX_FIXED_FUNCTION_DEVICE_SUPPORT + vx_hardware_caps_params_ext3_t paramExt3; +#endif memset(&paramExt, 0, sizeof(vx_hardware_caps_params_ext_t)); status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt), @@ -73,6 +76,13 @@ static vsi_status query_hardware_caps } #endif +#if VX_FIXED_FUNCTION_DEVICE_SUPPORT + memset(&paramExt3, 0, sizeof(vx_hardware_caps_params_ext3_t)); + status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt3), + sizeof(vx_hardware_caps_params_ext3_t)); + context->config.support_ffd = paramExt3.supportFixedFunctionDevice; +#endif + #endif if(param.evis1 == TRUE && param.evis2 == FALSE) @@ -93,6 +103,85 @@ final: return status; } +vsi_status query_hardware_caps_runtime + ( + vsi_nn_context_t context, + vsi_nn_runtime_option_t* options + ) +{ + vsi_status status = VSI_FAILURE; + vx_hardware_caps_params_t param; + VSI_UNREFERENCED(options); + memset(&(options->config), 0, sizeof(vsi_nn_hw_config_t)); +#if VX_STREAM_PROCESSOR_SUPPORT + vx_hardware_caps_params_ext2_t paramExt2; +#endif +#if VX_FIXED_FUNCTION_DEVICE_SUPPORT + vx_hardware_caps_params_ext3_t paramExt3; +#endif +#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT + vx_hardware_caps_params_ext_t paramExt; + + memset(&paramExt, 0, sizeof(vx_hardware_caps_params_ext_t)); + status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt), + sizeof(vx_hardware_caps_params_ext_t)); + param.evis1 = paramExt.base.evis1; + param.evis2 = paramExt.base.evis2; +#else + memset(&param, 0, sizeof(vx_hardware_caps_params_t)); + status = vxQueryHardwareCaps(context->c, &param, sizeof(vx_hardware_caps_params_t)); +#endif + TEST_CHECK_STATUS(status, final); + +#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT + options->config.subGroupSize = paramExt.subGroupSize; +#ifdef VSI_40BIT_VA_SUPPORT + options->config.use_40bits_va = paramExt.supportVA40; +#endif +#if VX_STREAM_PROCESSOR_SUPPORT + memset(&paramExt2, 0, sizeof(vx_hardware_caps_params_ext2_t)); + status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt2), + sizeof(vx_hardware_caps_params_ext2_t)); + if (options->enable_stream_processor) + { + options->config.support_stream_processor = 
paramExt.supportStreamProcessor; + options->config.sp_exec_count = paramExt2.streamProcessorExecCount; + options->config.sp_vector_depth = paramExt2.streamProcessorVectorSize; + if (options->config.sp_exec_count > 0) + { + options->config.sp_per_core_vector_depth = + options->config.sp_vector_depth / options->config.sp_exec_count; + } + } +#endif + +#if VX_FIXED_FUNCTION_DEVICE_SUPPORT + memset(&paramExt3, 0, sizeof(vx_hardware_caps_params_ext3_t)); + status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt3), + sizeof(vx_hardware_caps_params_ext3_t)); + options->config.support_ffd = paramExt3.supportFixedFunctionDevice; +#endif + +#endif + + if(param.evis1 == TRUE && param.evis2 == FALSE) + { + options->config.evis.ver = VSI_NN_HW_EVIS_1; + } + else if(param.evis1 == FALSE && param.evis2 == TRUE) + { + options->config.evis.ver = VSI_NN_HW_EVIS_2; + } + else + { + options->config.evis.ver = VSI_NN_HW_EVIS_NONE; + VSILOGW("Unsupported evis version"); + } + +final: + return status; +} + #if (defined(__ANDROID__)) && ((ANDROID_SDK_VERSION >= 30) || (__ANDROID_API__ >= 30)) static const char* ENV_ENABLE_SHADER = "vendor.VIV_VX_ENABLE_SHADER"; static const char* ENV_ENABLE_OPCHECK = "vendor.VSI_NN_ENABLE_OPCHECK"; @@ -153,6 +242,44 @@ vsi_status vsi_nn_initOptions return VSI_SUCCESS; } +vsi_status vsi_nn_initOptions_runtime + ( + vsi_nn_runtime_option_t *options, + vsi_nn_context_t ctx + ) +{ + int32_t default_value = 1; + + options->enable_shader = vsi_nn_getenv_asint(ENV_ENABLE_SHADER, 1); + options->enable_opcheck = vsi_nn_getenv_asint(ENV_ENABLE_OPCHECK, 1); +#if (VX_CONCAT_OPT_SUPPORT) + default_value = 0; +#else + default_value = 1; +#endif + options->enable_concat_optimize = vsi_nn_getenv_asint(ENV_ENABLE_CONCAT_OPTIMIZE, default_value); + options->enable_i8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1); + options->enable_dataconvert_optimize = vsi_nn_getenv_asint(ENV_ENABLE_DATACONVERT_OPTIMIZE, 1); + options->enable_stream_processor = vsi_nn_getenv_asint(ENV_ENABLE_STREAM_PROCESSOR, 1); + options->enable_rgb88_planar_nhwc = vsi_nn_getenv_asint(ENV_FORCE_RGB888_OUT_NHWC, 0); +#if (VX_STRIDED_SLICE_OPT_SUPPORT) + default_value = 0; +#else + default_value = 1; +#endif + options->enable_slice_optimize = vsi_nn_getenv_asint(ENV_ENABLE_SLICE_OPTIMIZE, default_value); + options->enable_batch_opt = vsi_nn_getenv_asint(ENV_ENABLE_BATCH_OPT, 0); + options->enable_save_file_type = vsi_nn_getenv_asint(ENV_SAVE_FILE_TYPE, 0); + options->enable_use_image_process = vsi_nn_getenv_asint(VSI_USE_IMAGE_PROCESS, -1); + options->enable_use_from_handle = vsi_nn_getenv_asint(VSI_USE_FROM_HANDLE, -1); + + /*init hw params*/ + options->config = ctx->config; + + return VSI_SUCCESS; +} + + vsi_nn_context_t vsi_nn_CreateContext ( void ) { diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index 85cad88..2ee8f0b 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -1362,7 +1362,7 @@ vsi_nn_graph_t * vsi_nn_CreateGraph graph->isAllowFastMode = TRUE; vsi_nn_MapInit( graph->node_table ); vsi_nn_MapInit( graph->tensor_table ); - vsi_nn_initOptions( ((vsi_nn_graph_prv_t*) graph)->options ); + vsi_nn_initOptions_runtime( ((vsi_nn_graph_prv_t*) graph)->options, ctx ); } else { @@ -3398,6 +3398,7 @@ char* vsi_nn_GetRunTimeVariable #define varSize 256 char* value_str = (char*)malloc(sizeof(char) * varSize); CHECK_PTR_FAIL_GOTO(value_str, "Create value_str fail.", final); + CHECK_PTR_FAIL_GOTO(graph, "Graph is NULL!", 
final); memset(value_str, 0, varSize); char tmp_value[varSize] = {0}; VSI_UNREFERENCED(tmp_value); @@ -3502,6 +3503,8 @@ vsi_status vsi_nn_SetRunTimeVariable break; case VSI_VX_ENABLE_STREAM_PROCESSOR: options->enable_stream_processor = atoi(value); + options->config.support_stream_processor = atoi(value); + status = query_hardware_caps_runtime(graph->ctx, options); break; case VSI_VX_ENABLE_BATCH_OPT: options->enable_batch_opt = atoi(value); diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index c017ea5..9cbe72a 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -895,10 +895,13 @@ static void _convert_const_I8toU8 attr->dtype.vx_type = VSI_NN_TYPE_UINT8; attr->dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; attr->dtype.zero_point += 128; - - if ( tensor->t ) vxReleaseTensor(&tensor->t); + if (tensor->t) vxReleaseTensor(&tensor->t); tensor->t = vsi_nn_CreateRawTensorFromData(graph, data, attr); - +#if defined(VSI_TENSOR_SPARSITY_SUPPORT) + int32_t is_sparsity = 0; + is_sparsity = vsi_nn_GetTensorIsSparsity(tensor); + vsi_nn_SetTensorIsSparsity(tensor, is_sparsity); +#endif final: vsi_nn_safe_free( data ); }/* _convert_const_I8toU8() */ diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index c30d031..f9c66bb 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -247,7 +247,8 @@ static void _set_preproc_node_input_attr vsi_nn_tensor_attr_t* attr, vsi_nn_preprocess_image_size_t* input_size, vsi_nn_preprocess_source_format_e* source_format, - vsi_nn_preprocess_source_layout_e* source_layout + vsi_nn_preprocess_source_layout_e* source_layout, + vsi_nn_preprocess_dtype_convert_t* data_convert ) { *input_attr = *attr; @@ -266,26 +267,33 @@ static void _set_preproc_node_input_attr } if(*source_format == VSI_NN_SOURCE_FORMAT_TENSOR) { - input_attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - input_attr->dtype.vx_type = VSI_NN_TYPE_FLOAT32; + if(data_convert != NULL) + { + input_attr->dtype = data_convert->dtype; + } + else + { + input_attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + input_attr->dtype.vx_type = VSI_NN_TYPE_FLOAT32; + } } else { input_attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; input_attr->dtype.vx_type = VSI_NN_TYPE_UINT8; } - if(*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB) + if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB) { - if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) + if (*source_layout == VSI_NN_SOURCE_LAYOUT_NCHW) { - input_attr->size[0] = input_attr->size[1]*input_attr->size[0]; - input_attr->size[1] = input_attr->size[2]; - input_attr->size[2] = 1; - } - else - { - input_attr->size[0] = input_attr->size[2]*input_attr->size[0]; - input_attr->size[2] = 1; + vsi_size_t channel = input_attr->size[2]; + if (channel != 3) + { + VSILOGE("RGB chanel must be 3, please have a check!"); + } + input_attr->size[2] = input_attr->size[1]; + input_attr->size[1] = input_attr->size[0]; + input_attr->size[0] = channel; } } @@ -333,15 +341,10 @@ static void _set_preproc_node_input_attr static void _set_preproc_node_output_attr ( vsi_nn_tensor_attr_t* output_attr, - vsi_nn_tensor_attr_t* attr, - vsi_nn_preprocess_dtype_convert_t* data_convert + vsi_nn_tensor_attr_t* attr ) { *output_attr = *attr; - if(data_convert != NULL) - { - output_attr->dtype = data_convert->dtype; - } output_attr->dtype.fmt = 
VSI_NN_DIM_FMT_NCHW; output_attr->dim_num = VSI_NN_DIM_AUTO; output_attr->is_const = FALSE; @@ -603,10 +606,11 @@ vsi_status vsi_nn_add_single_preproc_node _set_preproc_node_out_attr(node, image_resize, &org_norm_tensor->attr, source_layout); /* Set input tensor attr */ - _set_preproc_node_input_attr(&input_attr, &org_norm_tensor->attr, input_size, source_format, source_layout); + _set_preproc_node_input_attr(&input_attr, &org_norm_tensor->attr, input_size, + source_format, source_layout, data_convert); /* Set output tensor attr */ - _set_preproc_node_output_attr(&output_attr, &org_norm_tensor->attr, data_convert); + _set_preproc_node_output_attr(&output_attr, &org_norm_tensor->attr); /* Create new norm and virtual tensors */ if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 || diff --git a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c index 44ab53e..419cf2d 100644 --- a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c +++ b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c @@ -33,6 +33,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_util.h" #include "vsi_nn_rnn_helper.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_error.h" vsi_bool vsi_nn_rnn_find_best_kernel_size @@ -804,7 +805,7 @@ vsi_status vsi_nn_rnn_data_check_aligned vsi_size_t tensor_size = vsi_nn_GetTensorSize( input[i]->attr.size, input[i]->attr.dim_num, input[i]->attr.dtype.vx_type ); - if( ofst & 0x3f && !self->graph->ctx->config.support_stream_processor) + if( ofst & 0x3f && !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor) { vsi_nn_internal_init_tensor_attr(&attr, &input[i]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index 179755f..6a40412 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -155,6 +155,15 @@ static void print_tensor tensor->attr.dtype.group_size); ext_attr[count] = 0; break; + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC: + count = snprintf(&ext_attr[0], + _EXT_ATTR_BUF_SZ, + "ASYM GPTQ axis=%d, count=%d, group_size=%d", + tensor->attr.dtype.group_channel_dim, + tensor->attr.dtype.group_count, + tensor->attr.dtype.group_size); + ext_attr[count] = 0; + break; #endif default: vsi_nn_strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ); @@ -449,6 +458,11 @@ static vsi_bool _init_tensor scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.group_count); CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final ); memcpy(scales, tensor->attr.dtype.group_scales, tensor->attr.dtype.group_count * sizeof(float)); + zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim); + CHECK_PTR_FAIL_GOTO( zeroPoints, "Create buffer fail.", final ); + memcpy(zeroPoints, + tensor->attr.dtype.zero_points, + tensor->attr.dtype.zero_points_dim * sizeof(int32_t)); params.quant_data.affinePerGroup.channel_dim = tensor->attr.dtype.group_channel_dim; params.quant_data.affinePerGroup.group_size = tensor->attr.dtype.group_size; params.quant_data.affinePerGroup.scale_group_count = tensor->attr.dtype.group_count; @@ -460,6 +474,32 @@ static vsi_bool _init_tensor VSILOGE( "can't support qnt_type " "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC."); + break; +#endif + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC: +#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT + params.quant_format = 
(vsi_enum)VX_QUANT_AFFINE_SCALE_PER_GROUP; + // This is a hack that driver doesn't support const scales + scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.group_count); + CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final ); + memcpy(scales, tensor->attr.dtype.group_scales, tensor->attr.dtype.group_count * sizeof(float)); + zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim); + CHECK_PTR_FAIL_GOTO( zeroPoints, "Create buffer fail.", final ); + memcpy(zeroPoints, + tensor->attr.dtype.group_zero_points, + tensor->attr.dtype.group_count * sizeof(int32_t)); + params.quant_data.affinePerGroup.channel_dim = tensor->attr.dtype.group_channel_dim; + params.quant_data.affinePerGroup.group_size = tensor->attr.dtype.group_size; + params.quant_data.affinePerGroup.scale_group_count = tensor->attr.dtype.group_count; + params.quant_data.affinePerGroup.scales = scales; + params.quant_data.affinePerGroup.zero_points = zeroPoints; + params.quant_data.affinePerGroup.zero_point_group_count = tensor->attr.dtype.group_count; + break; +#else + VSILOGE( + "can't support qnt_type " + "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC."); + break; #endif default: break; @@ -1788,6 +1828,57 @@ int8_t vsi_nn_GetTensorIsScalar return _get_tensor_is_scalar((vsi_nn_tensor_prv_t*)tensor); } +int32_t _get_tensor_is_sparsity +( + vsi_nn_tensor_prv_t* tensor +) +{ + int32_t is_sparsity = FALSE; + if (NULL == tensor) + { + VSILOGE("To get is_sparsity, tensor pointer SHOULD NOT be NULL."); + goto final; + } +#if defined(VSI_TENSOR_SPARSITY_SUPPORT) + is_sparsity = tensor->sparsity_type; +#endif +final: + return is_sparsity; +} + +int32_t vsi_nn_GetTensorIsSparsity +( + vsi_nn_tensor_t* tensor +) +{ + return _get_tensor_is_sparsity((vsi_nn_tensor_prv_t*)tensor); +} + +vsi_status vsi_nn_SetTensorIsSparsity +( + vsi_nn_tensor_t* tensor, + int32_t is_sparsity +) +{ + VSI_UNREFERENCED(is_sparsity); + vsi_status status = VSI_SUCCESS; + if (NULL == tensor) { + status = VSI_FAILURE; + goto final; + } +#if defined(VSI_TENSOR_SPARSITY_SUPPORT) + vxSetTensorAttribute(tensor->t, + VX_TENSOR_SPARSITY_TYPE, + &is_sparsity, + sizeof(vx_enum)); + status = VSI_SUCCESS; + ((vsi_nn_tensor_prv_t*)tensor)->sparsity_type = is_sparsity; +#endif +final: + return status; +} + + vsi_status vsi_nn_CopyRawDataToTensor ( vsi_nn_graph_t* graph, diff --git a/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h b/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h index c041c65..a7dcf56 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h +++ b/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h @@ -75,6 +75,11 @@ vsi_status _set_tensor_is_scalar int8_t is_salar ); +vsi_status _set_tensor_is_sparsity( + vsi_nn_tensor_prv_t* tensor, + int32_t is_sparsity +); + int8_t _get_tensor_is_from_axisram ( vsi_nn_tensor_prv_t* tensor @@ -127,6 +132,11 @@ vsi_nn_tensor_t * vsi_nn_kernel_insert_reshape_node vsi_nn_opt_direction_e direction ); +uint32_t vsi_nn_get_tensor_dims + ( + vsi_nn_tensor_t* tensor + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/src/vsi_nn_types_prv.h b/src/tim/vx/internal/src/vsi_nn_types_prv.h index 4f9fd0b..5d89b0b 100644 --- a/src/tim/vx/internal/src/vsi_nn_types_prv.h +++ b/src/tim/vx/internal/src/vsi_nn_types_prv.h @@ -108,6 +108,11 @@ typedef struct _vsi_nn_tensor_prv /** create tensor from axisram.*/ int8_t is_from_axisram; + /** 2:4 sparsity attr. 
*/ +#if defined(VSI_TENSOR_SPARSITY_SUPPORT) + vx_tensor_sparsity_param_e sparsity_type; /*!< \brief sparsity type for the tensor */ +#endif + // Add tensor internal attribute here... } vsi_nn_tensor_prv_t;
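Editorial note (not part of the patch): the hunks above register a new ROPE operation whose behavior is controlled only by nn_param.rope.axis and nn_param.rope.interleaved, with the kernel resolved through the "rope" selector. The sketch below illustrates how a graph built against this ovxlib revision might instantiate the op through the public node API; the meaning of the three inputs (data plus cos/sin tables), the attribute values, and the tensor-id names are assumptions for illustration only, not something this diff specifies.

#include "vsi_nn_pub.h"

/* Hypothetical usage sketch: attach a ROPE node to an existing vsi_nn_graph_t.
 * Assumes `graph`, the tensor ids `in[3]` (data, cos table, sin table) and `out`
 * were created elsewhere with shapes/dtypes accepted by op_check above. */
vsi_nn_node_t* node = vsi_nn_AddNode(graph, VSI_NN_OP_ROPE, 3, 1, NULL);
if (node != NULL)
{
    node->nn_param.rope.axis = 0;            /* rotate along dimension 0 (assumed) */
    node->nn_param.rope.interleaved = FALSE; /* split-half rather than interleaved pairing (assumed) */
    node->input.tensors[0]  = in[0];
    node->input.tensors[1]  = in[1];
    node->input.tensors[2]  = in[2];
    node->output.tensors[0] = out;
}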