Update internal ovxlib to release/1.2.22 (#706)

* Update internal ovxlib to release/1.2.22

Signed-off-by: Feiyue.Chen <Feiyue.Chen@verisilicon.com>

* Refine yaml file for blocking tfhub model tests

Signed-off-by: Feiyue.Chen <Feiyue.Chen@verisilicon.com>

---------

Signed-off-by: Feiyue.Chen <Feiyue.Chen@verisilicon.com>
Authored by Chen Feiyue on 2025-01-08 13:22:46 +08:00, committed by GitHub
parent 149834832c
commit 8494275d76
94 changed files with 9466 additions and 3885 deletions


@@ -124,7 +124,7 @@ jobs:
 run: |
 git config --global user.email "xiang.zhang@verisilicon.com"
 git config --global user.name "xiang.zhang"
-git clone https://github.com/tensorflow/tensorflow.git ${{github.workspace}}/3rd-party/tensorflow && cd ${{github.workspace}}/3rd-party/tensorflow/ && git checkout v2.10.0
+git clone https://github.com/tensorflow/tensorflow.git ${{github.workspace}}/3rd-party/tensorflow && cd ${{github.workspace}}/3rd-party/tensorflow/ && git checkout v2.16.1
 git clone https://github.com/VeriSilicon/tflite-vx-delegate.git ${{github.workspace}}/vx-delegate
 cmake -B ${{github.workspace}}/vx-delegate/build -S ${{github.workspace}}/vx-delegate -DFETCHCONTENT_SOURCE_DIR_TENSORFLOW=${{github.workspace}}/3rd-party/tensorflow -DTIM_VX_INSTALL=${{github.workspace}}/tim-vx.install.dir/ -DTFLITE_ENABLE_NNAPI=OFF -DTFLITE_ENABLE_XNNPACK=OFF
 cmake --build ${{github.workspace}}/vx-delegate/build --config ${{env.BUILD_TYPE}}
@@ -283,61 +283,61 @@ jobs:
 # chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
 # ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/tfhub.movenet.multipose.tflite
-tfhub-efficientdet-lite0:
-runs-on: ubuntu-latest
-needs: [vx-delegate-build, tim-vx-unit-test]
-steps:
-- name: download test binary
-uses: actions/download-artifact@v3
-- name: download model
-run: |
-wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite0/detection/metadata/1.tflite
-- name: benchmark-model
-run: |
-chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
-${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
-tfhub-efficientdet-lite1:
-runs-on: ubuntu-latest
-needs: [vx-delegate-build, tim-vx-unit-test]
-steps:
-- name: download test binary
-uses: actions/download-artifact@v3
-- name: download model
-run: |
-wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite1/detection/metadata/1.tflite
-- name: benchmark-model
-run: |
-chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
-${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
-tfhub-efficientdet-lite2:
-runs-on: ubuntu-latest
-needs: [vx-delegate-build, tim-vx-unit-test]
-steps:
-- name: download test binary
-uses: actions/download-artifact@v3
-- name: download model
-run: |
-wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
-- name: benchmark-model
-run: |
-chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
-${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
-tfhub-efficientdet-lite3:
-runs-on: ubuntu-latest
-needs: [vx-delegate-build, tim-vx-unit-test]
-steps:
-- name: download test binary
-uses: actions/download-artifact@v3
-- name: download model
-run: |
-wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
-- name: benchmark-model
-run: |
-chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
-${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
+# tfhub-efficientdet-lite0:
+# runs-on: ubuntu-latest
+# needs: [vx-delegate-build, tim-vx-unit-test]
+# steps:
+# - name: download test binary
+# uses: actions/download-artifact@v3
+# - name: download model
+# run: |
+# wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite0/detection/metadata/1.tflite
+# - name: benchmark-model
+# run: |
+# chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
+# ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
+# tfhub-efficientdet-lite1:
+# runs-on: ubuntu-latest
+# needs: [vx-delegate-build, tim-vx-unit-test]
+# steps:
+# - name: download test binary
+# uses: actions/download-artifact@v3
+# - name: download model
+# run: |
+# wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite1/detection/metadata/1.tflite
+# - name: benchmark-model
+# run: |
+# chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
+# ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
+# tfhub-efficientdet-lite2:
+# runs-on: ubuntu-latest
+# needs: [vx-delegate-build, tim-vx-unit-test]
+# steps:
+# - name: download test binary
+# uses: actions/download-artifact@v3
+# - name: download model
+# run: |
+# wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
+# - name: benchmark-model
+# run: |
+# chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
+# ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
+# tfhub-efficientdet-lite3:
+# runs-on: ubuntu-latest
+# needs: [vx-delegate-build, tim-vx-unit-test]
+# steps:
+# - name: download test binary
+# uses: actions/download-artifact@v3
+# - name: download model
+# run: |
+# wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
+# - name: benchmark-model
+# run: |
+# chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
+# ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
 # acuity-yolov3-608-quant:
 # runs-on: ubuntu-latest


@@ -1 +1 @@
-1.2.14
+1.2.22


@@ -9,3 +9,4 @@ DEF_NODE_TYPE(custom_sample)
 DEF_NODE_TYPE(custom_tiny_yolov4_postprocess)
 DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_confidence)
 DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_box)
+DEF_NODE_TYPE(custom_letterbox)


@@ -9,3 +9,4 @@ DEF_OP(CUSTOM_SAMPLE)
 DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS)
 DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE)
 DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX)
+DEF_OP(CUSTOM_LETTERBOX)


@@ -0,0 +1,61 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CUSTOM_LETTERBOX_H
#define _VSI_NN_OP_CUSTOM_LETTERBOX_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_custom_letterbox_param
{
struct _custom_letterbox_local_data_t* local;
int32_t new_shape_w;
int32_t new_shape_h;
vx_bool auto_bool;
vx_bool scaleFill;
vx_bool scaleup;
int32_t stride;
vx_bool center;
float mean_r;
float mean_g;
float mean_b;
float scale_r;
float scale_g;
float scale_b;
int32_t pad_value_r;
int32_t pad_value_g;
int32_t pad_value_b;
vx_bool reverse_channel;
} vsi_nn_custom_letterbox_param;
_compiler_assert(offsetof(vsi_nn_custom_letterbox_param, local) == 0, \
vsi_nn_custom_lertterbox_h );
#ifdef __cplusplus
}
#endif
#endif
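For orientation, a hedged sketch of how the new parameter block above might be filled in user code. The VSI_NN_OP_CUSTOM_LETTERBOX enum name is assumed from the DEF_OP entry, and every value below is a placeholder, not a recommended setting:

/* Hypothetical configuration of a custom_letterbox node, e.g. one created with
 * vsi_nn_AddNode(graph, VSI_NN_OP_CUSTOM_LETTERBOX, 1, 1, NULL). */
static void set_letterbox_params(vsi_nn_node_t *node)
{
    vsi_nn_custom_letterbox_param *p = &node->nn_param.custom_letterbox;
    p->new_shape_w = 640;          /* target width after letterboxing */
    p->new_shape_h = 640;          /* target height after letterboxing */
    p->auto_bool   = vx_false_e;   /* do not round padding to a multiple of stride */
    p->scaleFill   = vx_false_e;   /* keep aspect ratio instead of stretching */
    p->scaleup     = vx_false_e;   /* only shrink, never enlarge */
    p->stride      = 32;
    p->center      = vx_true_e;    /* split the padding evenly on both sides */
    p->mean_r = p->mean_g = p->mean_b = 0.0f;
    p->scale_r = p->scale_g = p->scale_b = 1.0f / 255.0f;
    p->pad_value_r = p->pad_value_g = p->pad_value_b = 114;
    p->reverse_channel = vx_false_e;
}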


@@ -34,5 +34,6 @@
 #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h"
 #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h"
 #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h"
+#include "custom/ops/vsi_nn_op_custom_letterbox.h"
 #endif


@@ -203,3 +203,4 @@ DEF_OP(BITCAST)
 DEF_OP(GROUPED_CONV3D)
 DEF_OP(COL2IM)
 DEF_OP(L1_LAYER_NORM)
+DEF_OP(ROPE)


@@ -80,7 +80,7 @@ typedef struct _vsi_nn_pre_process_rgb_param
 float g_scale;
 float b_scale;
 /* pre process rgb layer local data structure */
-vsi_nn_pre_process_rgb_lcl_data local;
+vsi_nn_pre_process_rgb_lcl_data *local;
 } vsi_nn_pre_process_rgb_param;
 #ifdef __cplusplus


@@ -0,0 +1,49 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_ROPE_H
#define _VSI_NN_OP_ROPE_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_rope_param
{
struct _rope_local_data_t* local;
// Add parameters here
int32_t axis;
vsi_bool interleaved;
} vsi_nn_rope_param;
_compiler_assert(offsetof(vsi_nn_rope_param, local) == 0, \
vsi_nn_rope_h );
#ifdef __cplusplus
}
#endif
#endif
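The header only declares the parameters; the rotation itself is not part of this hunk. As a rough float reference, an assumption based on the usual rotary-position-embedding formulation with cos/sin tables supplied as separate tensors, the interleaved case pairs elements (2i, 2i+1):

/* Hedged illustration, not ovxlib code: rotary position embedding for the
 * interleaved layout. The non-interleaved case would instead pair element i
 * with element i + head_size/2. */
static void rope_interleaved_ref(const float *x, const float *cos_t,
                                 const float *sin_t, float *y, int head_size)
{
    int i;
    for (i = 0; i < head_size / 2; i++)
    {
        float x0 = x[2 * i];
        float x1 = x[2 * i + 1];
        y[2 * i]     = x0 * cos_t[i] - x1 * sin_t[i];
        y[2 * i + 1] = x0 * sin_t[i] + x1 * cos_t[i];
    }
}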


@@ -34,6 +34,7 @@ typedef struct _vsi_nn_topk_param
 {
 uint32_t k;
 int32_t axis;
+struct _topk_local_data_t* local;
 } vsi_nn_topk_param;
 #ifdef __cplusplus


@@ -384,25 +384,17 @@ static VSI_INLINE_API float fp16_to_fp32
 static VSI_INLINE_API float bfp16_to_fp32
 (
-int16_t in
+uint16_t in
 )
 {
-uint32_t t1, t2, t3;
 float out;
 fp32_bit_cast_t fp32_bit_cast;
-t1 = in & 0x00FF; // Mantissa
-t2 = in & 0xFF00; // Sign bit + Exponent
-t3 = in & 0x7F00; // Exponent
-t1 <<= 16;
-t2 <<= 16; // Shift (sign + Exponent) bit into position
-t1 |= t2; // Re-insert (sign + Exponent) bit
-fp32_bit_cast.data = t1;
+fp32_bit_cast.data = (uint32_t)(in << 16);
 out = fp32_bit_cast.val;
-return t3 == 0 ? 0.0f : out;
+return out;
 } /* bfp16_to_fp32() */
 static VSI_INLINE_API uint16_t fp32_to_fp16
@@ -720,7 +712,7 @@ static VSI_INLINE_API vsi_status dtype_to_float32
 *dst = fp16_to_fp32( *(int16_t *)src );
 break;
 case VSI_NN_TYPE_BFLOAT16:
-*dst = bfp16_to_fp32( *(int16_t *)src );
+*dst = bfp16_to_fp32( *(uint16_t *)src );
 break;
 case VSI_NN_TYPE_FLOAT8_E4M3:
 *dst = fp8_e4m3_to_fp32(*(int8_t*)src, src_dtype->scale);
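The rewrite above relies on bfloat16 being exactly the upper 16 bits of an IEEE-754 float32: shifting the value into the high half of a 32-bit word and bit-casting reproduces the float, including zero, which the removed t3 == 0 special case handled explicitly. A standalone sketch of the same idea:

#include <stdint.h>
#include <string.h>

/* bfloat16 -> float32: place the 16 payload bits in the top half of a
 * 32-bit word and reinterpret it as a float. */
static float bf16_to_f32(uint16_t in)
{
    uint32_t bits = (uint32_t)in << 16;
    float out;
    memcpy(&out, &bits, sizeof out); /* portable bit cast */
    return out;
}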

File diff suppressed because it is too large.


@@ -61,14 +61,13 @@ typedef struct _vsi_nn_hw_config_t
 {
 char target_name[VSI_NN_MAX_TARGET_NAME];
 vsi_nn_hw_evis_t evis;
-#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
 uint32_t subGroupSize;
-#endif
 uint32_t use_40bits_va;
 uint32_t support_stream_processor;
 uint32_t sp_exec_count;
 uint32_t sp_vector_depth;
 uint32_t sp_per_core_vector_depth;
+uint32_t support_ffd;
 } vsi_nn_hw_config_t;
 typedef struct _vsi_nn_runtime_option_t
@@ -89,6 +88,7 @@ typedef struct _vsi_nn_runtime_option_t
 int32_t enable_save_file_type;
 int32_t enable_use_image_process;
 int32_t enable_use_from_handle;
+vsi_nn_hw_config_t config;
 } vsi_nn_runtime_option_t;
 /**
@@ -101,6 +101,15 @@ typedef struct _vsi_nn_context_t
 vsi_nn_runtime_option_t options;
 } VSI_PUBLIC_TYPE *vsi_nn_context_t;
+/**
+* Query and set options->config hw params.
+*/
+OVXLIB_API vsi_status query_hardware_caps_runtime
+(
+vsi_nn_context_t ctx,
+vsi_nn_runtime_option_t *options
+);
 /**
 * Create context
 * Create ovxlib NN runtime context.
@@ -113,6 +122,11 @@ OVXLIB_API vsi_status vsi_nn_initOptions
 (
 vsi_nn_runtime_option_t *options
 );
+OVXLIB_API vsi_status vsi_nn_initOptions_runtime
+(
+vsi_nn_runtime_option_t *options,
+vsi_nn_context_t ctx
+);
 /**
 * Release context
 * Release ovxlib NN runtime resource and reset context handle to NULL.


@@ -57,5 +57,8 @@
 #define VSI_PER_GROUP_QUANTIZATION_SUPPORT
 #endif
 #define VSI_GRAPH_RUNTIME_ENV_SUPPORT
+#if defined(VX_TENSOR_SPARSITY_SUPPORT)
+#define VSI_TENSOR_SPARSITY_SUPPORT
+#endif
 #endif


@@ -216,6 +216,7 @@
 #include "ops/vsi_nn_op_grouped_conv3d.h"
 #include "ops/vsi_nn_op_col2im.h"
 #include "ops/vsi_nn_op_l1_layer_norm.h"
+#include "ops/vsi_nn_op_rope.h"
 /* custom node head define define */
 #include "custom/vsi_nn_custom_node_type.h"
 #include "ops/vsi_nn_op_inverse_sigmoid.h"
@@ -420,6 +421,7 @@ typedef union _vsi_nn_nn_param
 vsi_nn_grouped_conv3d_param grouped_conv3d;
 vsi_nn_col2im_param col2im;
 vsi_nn_l1_layer_norm_param l1_layer_norm;
+vsi_nn_rope_param rope;
 void* client_param;
 /* custom node data struct define */


@@ -86,8 +86,10 @@ typedef enum
 VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6,
 /** perchannel float8 */
 VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7,
-/** GPQT */
+/** pergroup symmetric */
 VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC = 0x8,
+/** pergroup asymmetric */
+VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC = 0x9,
 /** undefined type */
 VSI_NN_QNT_TYPE_NA = 0xff,
 } vsi_nn_qnt_type_e;


@@ -418,6 +418,34 @@ OVXLIB_API vsi_status vsi_nn_SetTensorIsScalar
 int8_t is_scalar
 );
+/**
+* Get Tensor is_scalar
+* Get the is_sparsity of the tensor
+*
+* @param[in] tensor Tensor.
+*
+* @return is_sparsity flag of the tensor.
+*/
+OVXLIB_API int32_t vsi_nn_GetTensorIsSparsity
+(
+vsi_nn_tensor_t* tensor
+);
+/**
+* Set Weight Tensor whether is sparsity
+* Set the is_sparsity for the tensor
+*
+* @param[in] tensor Tensor.
+* @param[in] new is_sparsity value of the tensor.
+*
+* @return VSI_SUCCESS on success, or error core otherwise.
+**/
+OVXLIB_API vsi_status vsi_nn_SetTensorIsSparsity(
+vsi_nn_tensor_t* tensor,
+int32_t is_sparsity
+);
 OVXLIB_API vsi_status vsi_nn_CopyRawDataToTensor
 (
 vsi_nn_graph_t* graph,


@@ -33,7 +33,7 @@ extern "C"{
 #define VSI_NN_VERSION_MAJOR 1
 #define VSI_NN_VERSION_MINOR 2
-#define VSI_NN_VERSION_PATCH 14
+#define VSI_NN_VERSION_PATCH 22
 #define VSI_NN_VERSION \
 (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
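With the patch level bumped to 22, the packed value works out to 1 * 10000 + 2 * 100 + 22 = 10222, so code that depends on the 1.2.22 additions (the ROPE operator, the custom letterbox op, the pergroup asymmetric quantization type) can gate on it roughly like this:

#if VSI_NN_VERSION >= 10222
/* paths that rely on ovxlib 1.2.22 features */
#endif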


@@ -0,0 +1,475 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_dtype_util_prv.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _CUSTOM_LETTERBOX_KERNEL_SOURCE "custom_letterbox"
// Add kernel hashtable here
#define CUSTOM_LETTERBOX_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
(( IN_DTYPE ) | ( OUT_DTYPE << 8 ))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ CUSTOM_LETTERBOX_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
CVIVANTE_NAMESPACE("evis.custom_letterbox_"#IN_DTYPE"to"#OUT_DTYPE), \
_CUSTOM_LETTERBOX_KERNEL_SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _custom_letterbox_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( U8, U8 ),
PACK_KERNEL_MAP( U8, I8 ),
PACK_KERNEL_MAP( U8, F16 ),
};
/*
* Kernel params
*/
static vx_param_description_t _custom_letterbox_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CUSTOM_LETTERBOX_PARAM_NUM _cnt_of_array( _custom_letterbox_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_custom_letterbox_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
2,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
VSI_UNREFERENCED(param_size);
int32_t top = 0;
int32_t bottom = 0;
int32_t left = 0;
int32_t right = 0;
float scale_w = 0;
float scale_h = 0;
int32_t resize_w = 0;
int32_t resize_h = 0;
int32_t resize_max_w = 0;
int32_t resize_max_h = 0;
float output_scale = 1.0f;
float output_zp = 0;
float out_scale_r = 0;
float out_zp_r = 0;
float out_scale_g = 0;
float out_zp_g = 0;
float out_scale_b = 0;
float out_zp_b = 0;
float pad_v_r = 0;
float pad_v_g = 0;
float pad_v_b = 0;
int32_t in_width = 0;
int32_t in_height = 0;
int32_t out_width = 0;
int32_t out_height = 0;
float mean_r = 0;
float mean_g = 0;
float mean_b = 0;
float scale_r = 0;
float scale_g = 0;
float scale_b = 0;
vx_int32 pad_value_r = 0;
vx_int32 pad_value_g = 0;
vx_int32 pad_value_b = 0;
vx_int32 r_order = 0;
vx_int32 b_order = 0;
vx_int32 reverse_channel = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &top);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &bottom);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &left);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &right);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &mean_r);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &mean_g);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &mean_b);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &scale_r);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &scale_g);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[11], &scale_b);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &pad_value_r);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &pad_value_g);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[14], &pad_value_b);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[15], &reverse_channel);
CHECK_STATUS_FAIL_GOTO(status, final );
in_width = (int32_t)attr[0]->shape->data[0] / 3;
in_height = (int32_t)attr[0]->shape->data[1];
out_width = (int32_t)attr[1]->shape->data[0];
out_height = (int32_t)attr[1]->shape->data[1] / 3;
output_scale = 1.0f / attr[1]->scale;
output_zp = (float)(attr[1]->zero_point);
resize_w = out_width - left - right;
resize_h = out_height - top - bottom;
resize_max_w = out_width - right;
resize_max_h = out_height - bottom;
scale_w = (float)in_width / resize_w;
scale_h = (float)in_height / resize_h;
out_scale_r = scale_r / output_scale;
out_zp_r = output_zp - out_scale_r * mean_r;
out_scale_g = scale_g / output_scale;
out_zp_g = output_zp - out_scale_g * mean_g;
out_scale_b = scale_b / output_scale;
out_zp_b = output_zp - out_scale_b * mean_b;
pad_v_r = pad_value_r * out_scale_r + out_zp_r;
pad_v_g = pad_value_g * out_scale_g + out_zp_g;
pad_v_b = pad_value_b * out_scale_b + out_zp_b;
if (reverse_channel)
{
r_order = out_height * 2;
b_order = 0;
}
else
{
r_order = 0;
b_order = out_height * 2;
}
{
gpu_dp_inst_t uniU8RightSubLeft_4x4 = {{
0x00090909, // TCfg
0x00000000, // ASelt
0x00140003, 0x00000025, // ABin
0x000a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniLeftToFloat32_4x4 = {{
0x00010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00000002, // ABin
0x00020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtactHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtract8Data_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status |= vsi_nn_kernel_gpu_add_param( node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniLeftToFloat32_4x4", &uniLeftToFloat32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Data_2x8", &uniExtract8Data_2x8 );
}
status |= vsi_nn_kernel_gpu_add_param( node, "top", &top );
status |= vsi_nn_kernel_gpu_add_param( node, "left", &left );
status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_r", &out_scale_r );
status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_g", &out_scale_g );
status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_b", &out_scale_b );
status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_r", &out_zp_r );
status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_g", &out_zp_g );
status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_b", &out_zp_b );
status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_r", &pad_v_r );
status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_g", &pad_v_g );
status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_b", &pad_v_b );
status |= vsi_nn_kernel_gpu_add_param( node, "scale_w", &scale_w );
status |= vsi_nn_kernel_gpu_add_param( node, "scale_h", &scale_h );
status |= vsi_nn_kernel_gpu_add_param( node, "resize_max_w", &resize_max_w );
status |= vsi_nn_kernel_gpu_add_param( node, "resize_max_h", &resize_max_h );
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height );
status |= vsi_nn_kernel_gpu_add_param( node, "r_order", &r_order );
status |= vsi_nn_kernel_gpu_add_param( node, "b_order", &b_order );
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_size[0] = out_width;
gpu_param.global_size[1] = out_height;
status |= vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _custom_warp_affine_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _custom_letterbox_kernel_map;
size_t kernel_map_size = _cnt_of_array( _custom_letterbox_kernel_map );
vx_param_description_t * param_def = _custom_letterbox_kernel_param_def;
size_t param_def_size = _cnt_of_array( _custom_letterbox_kernel_param_def );
vx_kernel_initialize_f initializer = _custom_letterbox_initializer;
uint32_t key = 0;
uint32_t i = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = CUSTOM_LETTERBOX_HASH_KEY( in_dtype, out_dtype );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (vx_uint32)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CUSTOM_LETTERBOX_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
size_t i = 0;
int32_t top = vsi_nn_kernel_param_get_int32( params, "top");
int32_t bottom = vsi_nn_kernel_param_get_int32( params, "bottom");
int32_t left = vsi_nn_kernel_param_get_int32( params, "left");
int32_t right = vsi_nn_kernel_param_get_int32( params, "right");
float mean_r = vsi_nn_kernel_param_get_float32( params, "mean_r");
float mean_g = vsi_nn_kernel_param_get_float32( params, "mean_g");
float mean_b = vsi_nn_kernel_param_get_float32( params, "mean_b");
float scale_r = vsi_nn_kernel_param_get_float32( params, "scale_r");
float scale_g = vsi_nn_kernel_param_get_float32( params, "scale_g");
float scale_b = vsi_nn_kernel_param_get_float32( params, "scale_b");
int32_t pad_value_r = vsi_nn_kernel_param_get_int32( params, "pad_value_r");
int32_t pad_value_g = vsi_nn_kernel_param_get_int32( params, "pad_value_g");
int32_t pad_value_b = vsi_nn_kernel_param_get_int32( params, "pad_value_b");
int32_t reverse_channel = vsi_nn_kernel_param_get_int32( params, "reverse_channel");
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
uint32_t param_num = _CUSTOM_LETTERBOX_PARAM_NUM;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
shapes[0][0] = inputs[0]->attr.size[1] * 3;
shapes[0][1] = inputs[0]->attr.size[2];
shapes[1][0] = outputs[0]->attr.size[0];
shapes[1][1] = outputs[0]->attr.size[1] * 3;
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], 2 );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[1], 2 );
if (reshape_tensors[0] == NULL ||
reshape_tensors[1] == NULL)
{
goto final;
}
if (reverse_channel)
{
float mean_temp = mean_r;
float scale_temp = scale_r;
int32_t pad_value_temp = pad_value_r;
mean_r = mean_b;
mean_b = mean_temp;
scale_r = scale_b;
scale_b = scale_temp;
pad_value_r = pad_value_b;
pad_value_b = pad_value_temp;
}
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 2;
vsi_nn_kernel_node_pack_io( node_params, param_num,
reshape_tensors, 1, &reshape_tensors[1], 1 );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &bottom );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &right );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_r );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_g );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_b );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_r );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_g );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_b );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_r );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_g );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_b );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse_channel );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, param_num );
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
vsi_nn_kernel_scalar_release( &node_params[14] );
vsi_nn_kernel_scalar_release( &node_params[15] );
CHECK_STATUS(status);
}
}
final:
for (i = 0; i < 2; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( custom_letterbox, _setup )
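The kernel lookup in _query_kernel is keyed purely on the dtype pair packed by CUSTOM_LETTERBOX_HASH_KEY; a minimal restatement of that packing outside the macro, for reference only:

/* Same packing as CUSTOM_LETTERBOX_HASH_KEY: input dtype in bits 0-7 and
 * output dtype in bits 8-15, so each (in, out) combination selects exactly
 * one entry of _custom_letterbox_kernel_map. */
static uint32_t letterbox_hash_key(uint32_t in_dtype, uint32_t out_dtype)
{
    return in_dtype | (out_dtype << 8);
}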


@@ -35,6 +35,7 @@
 #include "utils/vsi_nn_dtype_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "libnnext/vsi_nn_vxkernel.h"
+#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
 #define _CPU_ARG_NUM (1)
 #define _CPU_INPUT_NUM (1)
@@ -42,6 +43,7 @@
 #define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
 #define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
 #define _KERNEL_NAME ("com.vivantecorp.extension.Softmax2VXC")
+#define _KERNEL_NAME_U8 ("com.vivantecorp.extension.Softmax2VXC_u8")
 #define SCALAR_INPUT_AXIS (2)
@@ -64,7 +66,11 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
 {
 vsi_status status = VSI_FAILURE;
 int sf_size = 0;
-vsi_nn_kernel_tensor_attr_t* attr = NULL;
+vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
+float srcZP = 0.0f;
+float srcScale = 1.0f;
+float dstZP = 0.0f;
+float dstScale = 1.0f;
 // Alignment with a power of two value.
 gpu_param_t gpu_param = {
 2, // workdim
@@ -75,14 +81,19 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
 VSI_UNREFERENCED(param_size);
-attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
-if (!attr)
+attr[0] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]);
+attr[1] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]);
+if ((!attr[0]) || (!attr[1]))
 {
 VSILOGE("Query failure! at line");
 return status;
 }
-sf_size = (int)attr->shape->data[0];
+sf_size = (int)attr[0]->shape->data[0];
+srcScale = attr[0]->scale;
+srcZP = (float)attr[0]->zero_point;
+dstScale = 1.0f / attr[1]->scale;
+dstZP = (float)attr[1]->zero_point;
 gpu_param.global_offset[0] = 0;
 gpu_param.global_offset[1] = 0;
@@ -91,7 +102,7 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
 gpu_param.local_size[0] = 1;
 gpu_param.local_size[1] = 1;
 gpu_param.global_size[0] =
-gpu_align_p2((1 + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0],
+gpu_align_p2((attr[0]->shape->data[1] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0],
 gpu_param.local_size[0]);
 gpu_param.global_size[1] =
 gpu_align_p2((1 + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1],
@@ -107,25 +118,50 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
 0x00000001, 0x00000000, 0x00000001, 0x00000000,
 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
 }, GPU_DP_TYPE_16};
+gpu_dp_inst_t uniExtract8Bin_2x8 = {{
+0x11111111, // TCfg
+0x11110000, // ASelt
+0x06040200, 0x06040200, // ABin
+0x22222222, // BSelt
+0x00000000, 0x00000000, // BBin
+0x00002400, // AccumType, ConstantType, and PostShift
+0x00000001, 0x00000001, 0x00000001, 0x00000001,
+0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+}, GPU_DP_TYPE_16};
 status = vsi_nn_kernel_gpu_add_param( node,
 "Uni4x4_Fp16ToFp32", &Uni4x4_Fp16ToFp32 );
-vsi_nn_kernel_gpu_add_param(node,
+status |= vsi_nn_kernel_gpu_add_param( node,
+"uniExtract8Bin_2x8", &uniExtract8Bin_2x8 );
+status |= vsi_nn_kernel_gpu_add_param(node,
 "sf_size", &sf_size);
+status |= vsi_nn_kernel_gpu_add_param(node, "srcScale", &srcScale);
+status |= vsi_nn_kernel_gpu_add_param(node, "srcZP", &srcZP);
+status |= vsi_nn_kernel_gpu_add_param(node, "dstScale", &dstScale);
+status |= vsi_nn_kernel_gpu_add_param(node, "dstZP", &dstZP);
 }
-status = vsi_nn_kernel_gpu_config( node, &gpu_param );
+status |= vsi_nn_kernel_gpu_config( node, &gpu_param );
 if(status != VSI_SUCCESS)
 {
 VSILOGE("Initializer failure!");
 }
-if (attr) vsi_nn_kernel_tensor_attr_release( &attr );
+if (attr[0])
+{
+vsi_nn_kernel_tensor_attr_release( &attr[0] );
+attr[0] = NULL;
+}
+if (attr[1])
+{
+vsi_nn_kernel_tensor_attr_release( &attr[1] );
+attr[1] = NULL;
+}
 return status;
 }
-static const vx_kernel_description_t _kernel_info =
+static const vx_kernel_description_t _kernel_info1 =
 {
 KERNEL_ID_PLACEHOLDER,
 _KERNEL_NAME,
@@ -139,6 +175,20 @@ static const vx_kernel_description_t _kernel_info =
 vsi_nn_KernelDeinitializer
 };
+static const vx_kernel_description_t _kernel_info2 =
+{
+KERNEL_ID_PLACEHOLDER,
+_KERNEL_NAME_U8,
+NULL,
+kernel_param_def,
+_cnt_of_array( kernel_param_def ),
+vsi_nn_KernelValidator,
+NULL,
+NULL,
+_softmax_initializer,
+vsi_nn_KernelDeinitializer
+};
 static vsi_status _query_kernel
 (
 vsi_nn_tensor_t* const* const inputs,
@@ -146,9 +196,20 @@ static vsi_status _query_kernel
 vsi_nn_kernel_t* kernel
 )
 {
-VSI_UNREFERENCED(inputs);
-VSI_UNREFERENCED(outputs);
-memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+vsi_nn_kernel_dtype_e in_dtype;
+vsi_nn_kernel_dtype_e out_dtype;
+in_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
+out_dtype = vsi_nn_kernel_map_dtype(outputs[0]->attr.dtype.vx_type);
+if (in_dtype == U8 && out_dtype == U8)
+{
+memmove( &kernel->info, &_kernel_info2, sizeof(vx_kernel_description_t) );
+}
+else
+{
+memmove( &kernel->info, &_kernel_info1, sizeof(vx_kernel_description_t) );
+}
 vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
 "vsi_nn_kernel_header",
@@ -173,12 +234,42 @@ static vsi_nn_kernel_node_t _setup
 vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
 vsi_nn_kernel_node_t node = NULL;
 int32_t axis = 0;
+vsi_nn_tensor_t* reshape_tensors[2] = {NULL};
+vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}};
+uint32_t rank_in = 0;
+int32_t new_axis = 0;
+uint32_t i = 0;
+vsi_bool ret = vx_false_e;
 VSI_UNREFERENCED(input_num);
 VSI_UNREFERENCED(output_num);
 axis = vsi_nn_kernel_param_get_int32(params, "axis");
+ret = vsi_nn_kernel_optimize_softmax_shape(inputs[0]->attr.size,
+inputs[0]->attr.dim_num,
+axis,
+shapes[0],
+&rank_in,
+&new_axis);
+if (ret)
+{
+reshape_tensors[0] = vsi_nn_reshape_tensor(graph, inputs[0], shapes[0], rank_in);
+reshape_tensors[1] = vsi_nn_reshape_tensor(graph, outputs[0], shapes[0], rank_in);
+}
+else
+{
+return NULL;
+}
+if (!vsi_nn_kernel_gpu_check_shape(reshape_tensors[0]->attr.size,
+reshape_tensors[0]->attr.dim_num) ||
+new_axis > 2)
+{
+return NULL;
+}
 status = _query_kernel( inputs, outputs, kernel );
 if( VSI_SUCCESS == status)
 {
@@ -187,9 +278,9 @@ static vsi_nn_kernel_node_t _setup
 {
 /* Set inputs and outputs */
 vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
-inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
+reshape_tensors, _CPU_INPUT_NUM, &reshape_tensors[1], _CPU_OUTPUT_NUM );
 backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
-graph, I32, &axis );
+graph, I32, &new_axis );
 /* Pass parameters to node. */
 status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
@@ -200,6 +291,11 @@ static vsi_nn_kernel_node_t _setup
 status = VSI_FAILURE;
 }
 }
+for (i = 0; i < 2; i++)
+{
+vsi_safe_release_tensor(reshape_tensors[i]);
+}
 return node;
 } /* _setup() */
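The new srcScale/srcZP/dstScale/dstZP parameters suggest the U8 shader variant follows the usual dequantize, softmax, requantize pattern; the shader source itself is not in this diff, so the following is only a hedged CPU reference of that assumed computation, using the same parameter meanings as the initializer above (dstScale already holds 1/scale):

#include <math.h>
#include <stdint.h>

/* Plausible reference for a quantized (U8 in / U8 out) softmax row. */
static void softmax_u8_reference(const uint8_t *in, uint8_t *out, int n,
                                 float src_scale, float src_zp,
                                 float dst_scale, float dst_zp)
{
    float buf[1024]; /* sketch assumption: n <= 1024 */
    float sum = 0.0f, max_v = -INFINITY;
    int i;
    for (i = 0; i < n; i++)
    {
        buf[i] = ((float)in[i] - src_zp) * src_scale; /* dequantize */
        if (buf[i] > max_v) max_v = buf[i];
    }
    for (i = 0; i < n; i++)
    {
        buf[i] = expf(buf[i] - max_v); /* numerically stable softmax */
        sum += buf[i];
    }
    for (i = 0; i < n; i++)
    {
        float q = (buf[i] / sum) * dst_scale + dst_zp; /* requantize */
        out[i] = (uint8_t)(q < 0.0f ? 0.0f : (q > 255.0f ? 255.0f : q + 0.5f));
    }
}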


@@ -0,0 +1,227 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_constraint_check.h"
typedef struct _custom_letterbox_local_data_t {
int32_t placeholder;
} custom_letterbox_local_data_t;
/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
int32_t my_round(float in)
{
if (in >= 0)
{
return (int)(in + 0.5f);
}
else
{
return (int)(in - 0.5f);
}
}
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_custom_letterbox_param * p;
p = &(self->nn_param.custom_letterbox);
int32_t shape_w = (int32_t)inputs[0]->attr.size[1];
int32_t shape_h = (int32_t)inputs[0]->attr.size[2];
int32_t new_shape_w = (int32_t)outputs[0]->attr.size[0];
int32_t new_shape_h = (int32_t)outputs[0]->attr.size[1];
vx_bool auto_bool = p->auto_bool;
vx_bool scaleFill = p->scaleFill;
vx_bool scaleup = p->scaleup;
int32_t stride = p->stride;
vx_bool center = p->center;
float r = 1.0f;
int32_t new_unpad_w = 0;
int32_t new_unpad_h = 0;
int32_t dw = 0;
int32_t dh = 0;
int32_t top = 0;
int32_t bottom = 0;
int32_t left = 0;
int32_t right = 0;
r = (float)fmin((float)new_shape_w / shape_w, (float)new_shape_h / shape_h);
if (!scaleup)
{
r = (float)fmin(r, 1.0f);
}
new_unpad_w = my_round(r * shape_w);
new_unpad_h = my_round(r * shape_h);
dw = new_shape_w - new_unpad_w;
dh = new_shape_h - new_unpad_h;
if (auto_bool)
{
dw = dw % stride;
dh = dh % stride;
}
else if (scaleFill)
{
dw = 0;
dh = 0;
new_unpad_w = new_shape_w;
new_unpad_h = new_shape_h;
}
if (center)
{
top = my_round(dh / 2.0f - 0.1f);
bottom = my_round(dh / 2.0f + 0.1f);
left = my_round(dw / 2.0f - 0.1f);
right = my_round(dw / 2.0f + 0.1f);
}
else
{
top = 0;
bottom = my_round(dh + 0.1f);
left = 0;
right = my_round(dw + 0.1f);
}
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "top", top);
vsi_nn_kernel_param_add_int32( param, "bottom", bottom);
vsi_nn_kernel_param_add_int32( param, "left", left);
vsi_nn_kernel_param_add_int32( param, "right", right);
vsi_nn_kernel_param_add_float32( param, "mean_r", p->mean_r);
vsi_nn_kernel_param_add_float32( param, "mean_g", p->mean_g);
vsi_nn_kernel_param_add_float32( param, "mean_b", p->mean_b);
vsi_nn_kernel_param_add_float32( param, "scale_r", p->scale_r);
vsi_nn_kernel_param_add_float32( param, "scale_g", p->scale_g);
vsi_nn_kernel_param_add_float32( param, "scale_b", p->scale_b);
vsi_nn_kernel_param_add_int32( param, "pad_value_r", p->pad_value_r);
vsi_nn_kernel_param_add_int32( param, "pad_value_g", p->pad_value_g);
vsi_nn_kernel_param_add_int32( param, "pad_value_b", p->pad_value_b);
vsi_nn_kernel_param_add_int32( param, "reverse_channel", p->reverse_channel);
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
"custom_letterbox",
inputs, 1,
outputs, 1, param );
vsi_nn_kernel_param_release( &param );
return VSI_SUCCESS;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
BEGIN_IO_TYPE_DECL(LETTERBOX, 1, 1)
IO_TYPE(D_U8, D_F16)
IO_TYPE(D_U8, D_U8|Q_ASYM)
IO_TYPE(D_U8, D_I8|Q_DFP)
IO_TYPE(D_U8, D_I8|Q_ASYM)
IO_TYPE(D_U8, D_I8|Q_SYM)
END_IO_TYPE_DECL(LETTERBOX)
if (!VALIDATE_OP_IO_TYPES(LETTERBOX, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
destroy_op_io_types_desc(desc);
return FALSE;
}
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
outputs[0]->attr.size[0] = self->nn_param.custom_letterbox.new_shape_w;
outputs[0]->attr.size[1] = self->nn_param.custom_letterbox.new_shape_h;
outputs[0]->attr.size[2] = 3;
outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
}
return TRUE;
} /* op_setup() */
static vsi_status op_deinit
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
status = vsi_nn_op_common_deinit(self);
return status;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ CUSTOM_LETTERBOX,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS
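As a concrete check of op_compute above: letterboxing a 1920x1080 input into new_shape_w = new_shape_h = 640 with scaleup, auto_bool and scaleFill false and center true gives r = min(640/1920, 640/1080) ≈ 0.333, new_unpad = 640x360, dw = 0 and dh = 280, and therefore top = bottom = 140, left = right = 0. Those four paddings, together with the mean, scale and pad values, are exactly what is forwarded to the custom_letterbox kernel through the kernel parameter list.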


@@ -85,18 +85,24 @@ static const struct {
 HASH_CUMSUM_KERNELS(0, U8, U8)
 HASH_CUMSUM_KERNELS(0, F32, F32)
 HASH_CUMSUM_KERNELS(0, F32, U8)
+HASH_CUMSUM_KERNELS(0, I32, I32)
 HASH_CUMSUM_KERNELS(1, U8, U8)
 HASH_CUMSUM_KERNELS(1, F32, F32)
 HASH_CUMSUM_KERNELS(1, F32, U8)
+HASH_CUMSUM_KERNELS(1, I32, I32)
 HASH_CUMSUM_KERNELS(2, U8, U8)
 HASH_CUMSUM_KERNELS(2, F32, F32)
 HASH_CUMSUM_KERNELS(2, F32, U8)
+HASH_CUMSUM_KERNELS(2, I32, I32)
 HASH_CUMSUM_KERNELS_2D(0, U8, U8)
 HASH_CUMSUM_KERNELS_2D(0, F32, F32)
 HASH_CUMSUM_KERNELS_2D(0, F32, U8)
+HASH_CUMSUM_KERNELS_2D(0, I32, I32)
 HASH_CUMSUM_KERNELS_2D(1, U8, U8)
 HASH_CUMSUM_KERNELS_2D(1, F32, F32)
 HASH_CUMSUM_KERNELS_2D(1, F32, U8)
+HASH_CUMSUM_KERNELS_2D(1, I32, I32)
 HASH_CUMSUM_ARRAY_KERNELS(0, U8, U8, KERNEL_SOURCE_3)
 HASH_CUMSUM_ARRAY_KERNELS(0, F32, F32, KERNEL_SOURCE_3)


@@ -26,6 +26,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"
@@ -644,7 +645,8 @@ static vsi_nn_kernel_node_t _setup
 #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
 shader_cnt_support =
-(graph->ctx->config.subGroupSize >= 64 && graph->ctx->config.use_40bits_va) ? TRUE : FALSE;
+(((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize >= 64 &&
+((vsi_nn_graph_prv_t*)graph)->options->config.use_40bits_va) ? TRUE : FALSE;
 #endif
 if ((in1_h % 64 == 0) && (transFlg == 1) && (out_h % 8 == 0) && shader_cnt_support)
 {


@@ -75,6 +75,7 @@ static const _kernel_map_type _one_hot_kernel_map[] =
 PACK_ONE_HOT_KERNEL_MAP( F32, F32 ),
 PACK_ONE_HOT_KERNEL_MAP( I32, I32 ),
 PACK_ONE_HOT_KERNEL_MAP( I32, F32 ),
+PACK_ONE_HOT_KERNEL_MAP( I32, BF16 ),
 PACK_ONE_HOT_KERNEL_MAP( I32, U8 ),
 PACK_ONE_HOT_KERNEL_MAP( U8, U8 ),
 };


@@ -0,0 +1,329 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
INTERNAL_KERNEL_ROPE,
} _internal_kernel_e;
#define _ROPE_KERNEL_SOURCE "rope"
#define _ROPE_KERNEL_NAME CVIVANTE_NAMESPACE("cl.rope")
// Add kernel hashtable here
#define STR(a) #a
#define ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ) \
((IN0_DTYPE) | (IN0_DTYPE << 8) | (OUT_DTYPE << 16) | (AXIS << 25))
#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ) \
{ ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ), \
CVIVANTE_NAMESPACE("cl.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_axis"STR(AXIS)), \
"rope_0" }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _rope_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( F32, F32, F32, 0 ),
PACK_KERNEL_MAP( F32, F32, F32, 1 ),
PACK_KERNEL_MAP( F32, F32, F32, 2 ),
PACK_KERNEL_MAP( I32, I32, I32, 0 ),
PACK_KERNEL_MAP( I32, I32, I32, 1 ),
PACK_KERNEL_MAP( I32, I32, I32, 2 ),
PACK_KERNEL_MAP( U32, U32, U32, 0 ),
PACK_KERNEL_MAP( U32, U32, U32, 1 ),
PACK_KERNEL_MAP( U32, U32, U32, 2 ),
};
/*
* Kernel params
*/
static vx_param_description_t _rope_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROPE_PARAM_NUM _cnt_of_array( _rope_kernel_param_def )
#define SCALAR_AXIS (4)
#define SCALAR_IN_ZP (5)
#define SCALAR_COS_ZP (6)
#define SCALAR_SIN_ZP (7)
#define SCALAR_SCALE0 (8)
#define SCALAR_SCALE1 (9)
#define SCALAR_OUT_ZP (10)
#define SCALAR_HALF_HEAD_SIZE (11)
#define SCALAR_STEP (12)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_rope_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels can be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0} // globalWorkSize: image size in threads
};
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_attr_t* attr[2] = { NULL };
int32_t axis = 0;
vsi_size_array_t* out_shape = NULL;
vsi_size_t shape[3] = { 1 };
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis);
CHECK_STATUS_FAIL_GOTO(status, final);
out_shape = attr[1]->shape;
shape[0] = out_shape->data[0];
shape[1] = out_shape->data[1];
shape[2] = out_shape->data[2];
shape[axis] = shape[axis] / 2;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = shape[0];
gpu_param.global_size[1] = shape[1];
gpu_param.global_size[2] = out_shape->size > 2 ? shape[2] : 1;
status = vsi_nn_kernel_gpu_config(node, &gpu_param);
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(attr[0]);
SAFE_FREE_TENSOR_ATTR(attr[1]);
#undef SAFE_FREE_TENSOR_ATTR
return status;
} /* _rope_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t axis
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e in2_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _rope_kernel_map;
size_t kernel_map_size = _cnt_of_array( _rope_kernel_map );
vx_param_description_t * param_def = _rope_kernel_param_def;
vx_kernel_initialize_f initializer = _rope_initializer;
uint32_t key = 0;
uint32_t i;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type);
in2_dtype = vsi_nn_kernel_map_dtype(inputs[2]->attr.dtype.vx_type);
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
#define _PACK_SELECT_KEY( in0_type, in1_type, in2_type, out_type ) \
((in0_type) | (in1_type << 8) | (in2_type << 16) | (out_type << 24))
switch (_PACK_SELECT_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype))
{
case _PACK_SELECT_KEY(F32, F32, F32, F32):
case _PACK_SELECT_KEY(F16, F16, F16, F16):
key = ROPE_HASH_KEY(F32, F32, F32, axis);
break;
case _PACK_SELECT_KEY(U8, U8, U8, U8):
case _PACK_SELECT_KEY(U16, U16, U16, U16):
key = ROPE_HASH_KEY(U32, U32, U32, axis);
break;
case _PACK_SELECT_KEY(I8, I8, I8, I8):
case _PACK_SELECT_KEY(I16, I16, I16, I16):
case _PACK_SELECT_KEY(I32, I32, I32, I32):
key = ROPE_HASH_KEY(I32, I32, I32, axis);
break;
default:
break;
}
#undef _PACK_SELECT_KEY
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _rope_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ROPE_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
int32_t interleaved = vsi_nn_kernel_param_get_int32(params, "interleaved");
float in_scale = vsi_nn_get_tensor_scale(inputs[0]);
float cos_scale = vsi_nn_get_tensor_scale(inputs[1]);
float sin_scale = vsi_nn_get_tensor_scale(inputs[2]);
float out_scale = vsi_nn_get_tensor_scale(outputs[0]);
float in_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float cos_zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
float sin_zp = (float)vsi_nn_get_tensor_zero_point(inputs[2]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
int32_t half_head_size = interleaved ? 1 : (int32_t)(inputs[0]->attr.size[axis] / 2);
float scale0 = in_scale * cos_scale / out_scale;
float scale1 = in_scale * sin_scale / out_scale;
int32_t step = interleaved ? 2 : 1;
int32_t i = 0;
// Check if gpu can support the size
if ( !vsi_nn_kernel_gpu_check_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, axis );
if (VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ROPE_PARAM_NUM,
inputs, input_num, outputs, output_num );
/* Pass parameters to node. */
node_params[SCALAR_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis);
node_params[SCALAR_IN_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &in_zp);
node_params[SCALAR_COS_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &cos_zp);
node_params[SCALAR_SIN_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &sin_zp);
node_params[SCALAR_SCALE0] = vsi_nn_kernel_scalar_create(
graph, F32, &scale0);
node_params[SCALAR_SCALE1] = vsi_nn_kernel_scalar_create(
graph, F32, &scale1);
node_params[SCALAR_OUT_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &output_zp);
node_params[SCALAR_HALF_HEAD_SIZE] = vsi_nn_kernel_scalar_create(
graph, I32, &half_head_size);
node_params[SCALAR_STEP] = vsi_nn_kernel_scalar_create(
graph, I32, &step);
status = vsi_nn_kernel_node_pass_param( node, node_params, _ROPE_PARAM_NUM );
}
}
for (i = SCALAR_AXIS; i < (int32_t)_ROPE_PARAM_NUM; i++)
{
if (node_params[i])
{
vsi_nn_kernel_scalar_release(&node_params[i]);
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( rope, _setup )
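The CL backend above wires rotary position embedding (RoPE) through half_head_size, step and the scale0/scale1 factors computed in _setup(). As a reading aid, here is a minimal float reference of the rotation those parameters describe, assuming the conventional rotary pairing (adjacent elements when interleaved, elements half a head apart otherwise) and a cos/sin table with half_head_size entries; all names below are illustrative and not part of the commit:

/* Float reference of rotary embedding for one head vector.
 * Quantized variants fold scale0 = in_scale * cos_scale / out_scale and
 * scale1 = in_scale * sin_scale / out_scale into the two products. */
static void rope_reference_f32( const float * x, const float * cos_t, const float * sin_t,
                                float * out, int head_size, int interleaved )
{
    int half = head_size / 2;
    int i;
    for ( i = 0; i < half; i++ )
    {
        int a = interleaved ? 2 * i : i;        /* first element of the pair */
        int b = interleaved ? a + 1 : a + half; /* its rotation partner      */
        out[a] = x[a] * cos_t[i] - x[b] * sin_t[i];
        out[b] = x[b] * cos_t[i] + x[a] * sin_t[i];
    }
}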

View File

@ -27,6 +27,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "vsi_nn_types.h" #include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h" #include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h" #include "vsi_nn_graph.h"
#include "vsi_nn_log.h" #include "vsi_nn_log.h"
@ -299,7 +300,7 @@ static vsi_nn_kernel_node_t _setup
VSI_UNREFERENCED(output_num); VSI_UNREFERENCED(output_num);
#if (VX_ACTIVATION_EXT_SUPPORT) #if (VX_ACTIVATION_EXT_SUPPORT)
if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
{ {
return NULL; return NULL;
} }

View File

@ -26,6 +26,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "vsi_nn_types.h" #include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h" #include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h" #include "vsi_nn_graph.h"
#include "vsi_nn_log.h" #include "vsi_nn_log.h"
@ -457,7 +458,7 @@ static vsi_nn_kernel_node_t _setup
vsi_bool is_odd_even_sort = FALSE; vsi_bool is_odd_even_sort = FALSE;
vsi_bool is_bitnoic_segment = FALSE; vsi_bool is_bitnoic_segment = FALSE;
size_t param_num = _TOPK_PARAM_NUM; size_t param_num = _TOPK_PARAM_NUM;
int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2); int32_t max_stages = 7 + (int32_t)log2(((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize >> 2);
vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@ -483,6 +484,11 @@ static vsi_nn_kernel_node_t _setup
return NULL; return NULL;
} }
if (block_size >= GPU_TENSOR_MAX_WIDTH)
{
return NULL;
}
shape[0][0] = block_size; shape[0][0] = block_size;
shape[0][1] = block_num; shape[0][1] = block_num;
shape[1][0] = top_k; shape[1][0] = top_k;
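A quick worked example of the max_stages expression above, with an illustrative subGroupSize of 64: 64 >> 2 is 16, log2(16) is 4, so max_stages becomes 7 + 4 = 11. The added block_size >= GPU_TENSOR_MAX_WIDTH check makes _setup return NULL, so oversized blocks are simply not handled by this kernel.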

View File

@ -27,6 +27,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "vsi_nn_types.h" #include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h" #include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h" #include "vsi_nn_graph.h"
#include "vsi_nn_log.h" #include "vsi_nn_log.h"
@ -192,7 +193,7 @@ static vsi_bool _bucketize_support_types
return FALSE; return FALSE;
} }
if (in_dtype == F16 && graph->ctx->config.evis.ver != VSI_NN_HW_EVIS_2) if (in_dtype == F16 && ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver != VSI_NN_HW_EVIS_2)
{ {
return FALSE; return FALSE;
} }

View File

@ -27,6 +27,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "vsi_nn_types.h" #include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h" #include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h" #include "vsi_nn_graph.h"
#include "vsi_nn_log.h" #include "vsi_nn_log.h"
@ -771,7 +772,8 @@ static vsi_nn_kernel_node_t _setup
temp_tensor[1] = weights; temp_tensor[1] = weights;
temp_tensor[2] = biases; temp_tensor[2] = biases;
ks = get_kernel_size(weights->attr.size[0], dilation, stride, graph->ctx->config.evis.ver); ks = get_kernel_size(weights->attr.size[0], dilation, stride,
((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver);
status = _query_kernel( kernel, temp_tensor, outputs, dilation, ks); status = _query_kernel( kernel, temp_tensor, outputs, dilation, ks);

View File

@ -121,7 +121,9 @@ static const _kernel_map_type _groupnorm_sums_kernel_map[] =
TENSOR_GROUPNORM_SUMS_KERNELS( U8, F32, KERNEL_SOURCE_0 ) TENSOR_GROUPNORM_SUMS_KERNELS( U8, F32, KERNEL_SOURCE_0 )
TENSOR_GROUPNORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_0 ) TENSOR_GROUPNORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_0 )
TENSOR_GROUPNORM_SUMS_KERNELS( I16, F32, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SUMS_KERNELS( I16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS( U16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS_2D( I16, F32, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SUMS_KERNELS_2D( I16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS_2D( U16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS( F16, F32, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SUMS_KERNELS( F16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SUMS_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 )
}; };
@ -174,6 +176,9 @@ static const _kernel_map_type _groupnorm_kernel_map[] =
TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, U8, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, U8, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, F16, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, F16, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, F16, KERNEL_SOURCE_2 ) TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, F16, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SCALE_KERNELS( U16, F32, U16, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SCALE_KERNELS_2D( U16, F32, U16, KERNEL_SOURCE_2 )
}; };
/* /*
@ -245,6 +250,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
float sum_x2_tail0 = 1; float sum_x2_tail0 = 1;
float sum_x2_tail1 = 1; float sum_x2_tail1 = 1;
float work_item_pixels = 1; float work_item_pixels = 1;
vsi_bool is_input_8bits = FALSE;
VSI_UNREFERENCED(param_size); VSI_UNREFERENCED(param_size);
@ -263,12 +269,13 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
width = (int32_t)(input_shape->data[0]); width = (int32_t)(input_shape->data[0]);
height = (int32_t)(input_shape->data[1]); height = (int32_t)(input_shape->data[1]);
chn = (int32_t)(attr[1]->shape->data[1]); chn = (int32_t)(attr[1]->shape->data[1]);
is_input_8bits = attr[0]->dtype == I8 || attr[0]->dtype == U8;
if (is2D) if (is2D)
{ {
height = 1; height = 1;
} }
work_item_pixels = (float)height * 16; work_item_pixels = is_input_8bits ? 16 * (float)height : 8 * (float)height;
sum_x_tail = -work_item_pixels * input_zp * input_scale; sum_x_tail = -work_item_pixels * input_zp * input_scale;
sum_x2_tail0 = work_item_pixels * input_zp * input_zp * input_scale2; sum_x2_tail0 = work_item_pixels * input_zp * input_zp * input_scale2;
@ -281,11 +288,11 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
shaderParam.local_size[1] = 1; shaderParam.local_size[1] = 1;
shaderParam.local_size[2] = 1; shaderParam.local_size[2] = 1;
if (attr[0]->dtype == I8 || attr[0]->dtype == U8) if (is_input_8bits)
{ {
shaderParam.global_size[0] = (width + 255) / 256 * 16; shaderParam.global_size[0] = (width + 255) / 256 * 16;
} }
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
{ {
shaderParam.global_size[0] = (width + 127) / 128 * 16; shaderParam.global_size[0] = (width + 127) / 128 * 16;
} }
@ -324,7 +331,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1); status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1);
CHECK_STATUS_FAIL_GOTO(status, OnError ); CHECK_STATUS_FAIL_GOTO(status, OnError );
} }
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
{ {
gpu_dp_inst_t uniSum_X_X2_8x2 = {{ gpu_dp_inst_t uniSum_X_X2_8x2 = {{
0x55555555, // TCfg 0x55555555, // TCfg
@ -483,7 +490,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
} }
shaderParam.global_scale[0] = 16; shaderParam.global_scale[0] = 16;
if (attr[0]->dtype == I16 || attr[0]->dtype == F16) if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
{ {
shaderParam.global_scale[0] = 8; shaderParam.global_scale[0] = 8;
} }
@ -610,6 +617,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError ); CHECK_STATUS_FAIL_GOTO(status, OnError );
} }
break; break;
case _PACK_SELECT_KEY( U16, U16 ):
case _PACK_SELECT_KEY( I16, I16 ): case _PACK_SELECT_KEY( I16, I16 ):
case _PACK_SELECT_KEY( I16, F16 ): case _PACK_SELECT_KEY( I16, F16 ):
case _PACK_SELECT_KEY( F16, F16 ): case _PACK_SELECT_KEY( F16, F16 ):
@ -838,8 +846,7 @@ static vsi_nn_kernel_node_t _setup
attr.is_const = FALSE; attr.is_const = FALSE;
attr.vtl = TRUE; attr.vtl = TRUE;
attr.size[0] = ((new_shape[0] + 255) / 256) * 4; attr.size[0] = ((new_shape[0] + 255) / 256) * 4;
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 if (in0_dtype == I16 || in0_dtype == F16 || in0_dtype == U16)
|| inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16)
{ {
attr.size[0] = ((new_shape[0] + 127) / 128) * 4; attr.size[0] = ((new_shape[0] + 127) / 128) * 4;
} }
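To make the reworked work-item math concrete (a width of 512 is illustrative): an 8-bit input assigns 16*height pixels to each work item and launches global_size[0] = (512 + 255) / 256 * 16 = 32, while the 16-bit paths, including the newly added U16 case, assign 8*height pixels per work item and launch (512 + 127) / 128 * 16 = 64 (integer division throughout). The work_item_pixels split between 16*height and 8*height above keeps the sum tails consistent with that choice.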

View File

@ -124,15 +124,16 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
{0, 0, 0} {0, 0, 0}
}; };
int8_t in0_fl = 0; int8_t in0_fl = 0;
int32_t inputZP0 = 0; int32_t input0_zp = 0;
float input_scale0 = 1.0f; float input0_scale = 1.0f;
int32_t inputZP1 = 0; int32_t input1_zp = 0;
float input_scale1 = 1.0f; float input1_scale = 1.0f;
float output_zp = 0;
int8_t out_fl = 0; int8_t out_fl = 0;
float outputZP = 0;
int32_t shift0 = 0; int32_t shift0 = 0;
vsi_bool is_ge_fl = FALSE; vsi_bool is_ge_fl = FALSE;
vsi_bool is_2d_img = FALSE; vsi_bool is_2d_img = FALSE;
uint32_t evis_version = 0; uint32_t evis_version = 0;
@ -165,27 +166,23 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
out_shape = attr[2]->shape; out_shape = attr[2]->shape;
inputZP0 = attr[0]->zero_point; input0_zp = attr[0]->zero_point;
input_scale0 = attr[0]->scale; input0_scale = attr[0]->scale;
inputZP1 = attr[1]->zero_point; input1_zp = attr[1]->zero_point;
input_scale1 = attr[1]->scale; input1_scale = attr[1]->scale;
outputZP = (float)attr[2]->zero_point; output_zp = (float)attr[2]->zero_point;
input_scale0 = input_scale0 / attr[2]->scale; input0_scale = input0_scale / attr[2]->scale;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP &&
attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)
{ {
in0_fl = (int8_t)attr[0]->dfp.fl; in0_fl = (int8_t)attr[0]->dfp.fl;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
out_fl = (int8_t)attr[2]->dfp.fl; out_fl = (int8_t)attr[2]->dfp.fl;
}
shift0 = in0_fl - out_fl; shift0 = in0_fl - out_fl;
is_ge_fl = shift0 >= 0;
}
is_2d_img = (out_shape->size < 3) || (out_shape->data[2] == 1); is_2d_img = (out_shape->size < 3) || (out_shape->data[2] == 1);
is_ge_fl = shift0 >= 0;
#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, GE_FL, IMG_2D, EVIS2 ) \ #define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, GE_FL, IMG_2D, EVIS2 ) \
(IN0_TYPE | ( OUT_TYPE << 16) | (GE_FL << 24) | (IMG_2D << 25) | (EVIS2 << 26)) (IN0_TYPE | ( OUT_TYPE << 16) | (GE_FL << 24) | (IMG_2D << 25) | (EVIS2 << 26))
@ -204,7 +201,6 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
gpu_param.global_scale[1] = 1; gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1; gpu_param.global_scale[2] = 1;
} }
gpu_param.global_size[0] = gpu_align_p2( gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1) (out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4); / gpu_param.global_scale[0], 4);
@ -302,10 +298,10 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
CHECK_STATUS_FAIL_GOTO(status, final); CHECK_STATUS_FAIL_GOTO(status, final);
} }
break; break;
case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 1 ): case _PACK_SELECT_KEY(BF16, BF16, 0, 1, 1):
case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 2 ): case _PACK_SELECT_KEY(BF16, BF16, 0, 1, 2):
case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 1 ): case _PACK_SELECT_KEY(BF16, BF16, 0, 0, 1):
case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 2 ): case _PACK_SELECT_KEY(BF16, BF16, 0, 0, 2):
{ {
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg 0x11111111, // TCfg
@ -446,15 +442,15 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
status |= vsi_nn_kernel_gpu_add_param( node, status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvF16toF32_part1_4x4", &uniConvF16toF32_part1_4x4 ); "uniConvF16toF32_part1_4x4", &uniConvF16toF32_part1_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, status |= vsi_nn_kernel_gpu_add_param( node,
"inputZP0", &inputZP0 ); "input0_zp", &input0_zp);
status |= vsi_nn_kernel_gpu_add_param( node, status |= vsi_nn_kernel_gpu_add_param( node,
"input_scale0", &input_scale0 ); "input0_scale", &input0_scale );
status |= vsi_nn_kernel_gpu_add_param( node, status |= vsi_nn_kernel_gpu_add_param( node,
"inputZP1", &inputZP1 ); "input1_zp", &input1_zp);
status |= vsi_nn_kernel_gpu_add_param( node, status |= vsi_nn_kernel_gpu_add_param( node,
"input_scale1", &input_scale1 ); "input1_scale", &input1_scale );
status |= vsi_nn_kernel_gpu_add_param( node, status |= vsi_nn_kernel_gpu_add_param( node,
"outputZP", &outputZP ); "output_zp", &output_zp );
if (attr[2]->dtype == F16) if (attr[2]->dtype == F16)
{ {
status |= vsi_nn_kernel_gpu_add_param( node, status |= vsi_nn_kernel_gpu_add_param( node,
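One behavioural note on the prelu hunk above: the dynamic-fixed-point shift is now derived only when both the input and the output are DFP-quantized, and is_ge_fl moves inside that branch. With illustrative fraction lengths in0_fl = 7 and out_fl = 5, shift0 = 2, so is_ge_fl is TRUE and the non-negative-shift variant is selected; when only one side is DFP, the fraction-length shift is no longer applied, which is also why the BF16 pack keys above change from GE_FL = 1 to 0.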

View File

@ -27,6 +27,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "vsi_nn_types.h" #include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h" #include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h" #include "vsi_nn_graph.h"
#include "vsi_nn_log.h" #include "vsi_nn_log.h"
@ -58,53 +59,92 @@ typedef enum
#define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt" #define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_1" #define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_1"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_2" #define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_2"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_3"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC4(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_4"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC5(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_5"
#define STR(a) #a #define STR(a) #a
// Add kernel hashtable here // Add kernel hashtable here
#define RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, scale_flag ) \ #define RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, scale_flag, same_type ) \
(( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (scale_flag)) (( IN_DTYPE ) | ( OUT_DTYPE << 8) | (scale_flag << 16) | (same_type << 22))
#define PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE ) \ #define _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, DOWN ), \ { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, DOWN, SAME_TYPE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_DOWN"), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_DOWN"), \
_RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) } _RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE ) \ #define PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP ), \ _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, TRUE ), \
_PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, FALSE )
#define _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP, SAME_TYPE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP"), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP"), \
_RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) } _RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE ) \ #define PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_OPT ), \ _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, TRUE ), \
_PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, FALSE )
#define _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_OPT, SAME_TYPE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP_opt"), \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP_opt"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_OPT(IN_DTYPE) } _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \ #define PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF ), \ _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, TRUE ), \
_PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, FALSE )
#define PACK_KERNEL_MAP_UP_2X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_2x_upsample_half_pixel_centers"), \ "_SAME_2x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \ #define PACK_KERNEL_MAP_UP_4X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF ), \ { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_4x_upsample_half_pixel_centers"), \ "_SAME_4x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \ #define PACK_KERNEL_MAP_UP_8X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF ), \ { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_8x_upsample_half_pixel_centers"), \ "_SAME_8x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(IN_DTYPE) } _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \ #define PACK_KERNEL_MAP_UP_3X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF ), \ { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_3x_upsample_half_pixel_centers"), \ "_SAME_3x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF, FALSE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_2x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF, FALSE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_4x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF, FALSE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_8x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC5(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF, FALSE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_3x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC4(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_8X_ALIGN( IN_DTYPE, OUT_DTYPE ) \ #define PACK_KERNEL_MAP_UP_8X_ALIGN( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN ), \ { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_8x_upsample_align_corners"), \ "_SAME_8x_upsample_align_corners"), \
"resize_bilinear_align_corners" } "resize_bilinear_align_corners" }
@ -135,6 +175,10 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] =
PACK_KERNEL_MAP_UP(F16, F16), PACK_KERNEL_MAP_UP(F16, F16),
PACK_KERNEL_MAP_UP(BF16, BF16), PACK_KERNEL_MAP_UP(BF16, BF16),
PACK_KERNEL_MAP_UP_OPT(U8, U8), PACK_KERNEL_MAP_UP_OPT(U8, U8),
PACK_KERNEL_MAP_UP_2X_HALF_SAME_TYPE(U8, U8),
PACK_KERNEL_MAP_UP_3X_HALF_SAME_TYPE(U8, U8),
PACK_KERNEL_MAP_UP_4X_HALF_SAME_TYPE(U8, U8),
PACK_KERNEL_MAP_UP_8X_HALF_SAME_TYPE(U8, U8),
PACK_KERNEL_MAP_UP_2X_HALF(U8, U8), PACK_KERNEL_MAP_UP_2X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_3X_HALF(U8, U8), PACK_KERNEL_MAP_UP_3X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_4X_HALF(U8, U8), PACK_KERNEL_MAP_UP_4X_HALF(U8, U8),
@ -675,15 +719,20 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
vsi_size_array_t * out_shape = NULL; vsi_size_array_t * out_shape = NULL;
vsi_size_array_t * in_shape = NULL; vsi_size_array_t * in_shape = NULL;
vsi_nn_kernel_dtype_e input_dtype = F16; vsi_nn_kernel_dtype_e input_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = F16;
uint32_t depth = 0; uint32_t depth = 0;
uint32_t in_width = 0; uint32_t in_width = 0;
uint32_t in_height = 0; uint32_t in_height = 0;
uint32_t out_width = 0; uint32_t out_width = 0;
uint32_t out_height = 0; uint32_t out_height = 0;
vsi_bool is_same_type = FALSE;
vsi_bool is_2x_up_kernel = FALSE; vsi_bool is_2x_up_kernel = FALSE;
vsi_bool is_3x_up_kernel = FALSE; vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE; vsi_bool is_4x_up_kernel = FALSE;
vsi_bool is_8x_up_kernel = FALSE; vsi_bool is_8x_up_kernel = FALSE;
float scale = 1.f;
int32_t input_zp = 0;
int32_t output_zp = 0;
VSI_UNREFERENCED(param_size); VSI_UNREFERENCED(param_size);
@ -695,14 +744,20 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
out_shape = output_attr->shape; out_shape = output_attr->shape;
in_shape = input_attr->shape; in_shape = input_attr->shape;
input_dtype = input_attr->dtype; input_dtype = input_attr->dtype;
output_dtype = output_attr->dtype;
in_width = (uint32_t)(in_shape->data[0]); in_width = (uint32_t)(in_shape->data[0]);
in_height = (uint32_t)(in_shape->data[1]); in_height = (uint32_t)(in_shape->data[1]);
depth = (uint32_t)(in_shape->data[2]); depth = (uint32_t)(in_shape->data[2]);
out_width = (uint32_t)(out_shape->data[0]); out_width = (uint32_t)(out_shape->data[0]);
out_height = (uint32_t)(out_shape->data[1]); out_height = (uint32_t)(out_shape->data[1]);
scale = input_attr->scale;
input_zp = input_attr->zero_point;
scale /= output_attr->scale;
output_zp = output_attr->zero_point;
is_same_type = _is_same_quant(input_attr, output_attr);
if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr))) if ((U8 == input_dtype) && (output_dtype == U8))
{ {
is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height); is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height); is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
@ -728,6 +783,22 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
gpu_param.global_scale[2] = 1; gpu_param.global_scale[2] = 1;
} }
if (is_2x_up_kernel || is_3x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
{
uint16_t M0 = 0;
int32_t postShift = 0;
uint32_t multAndoutZP[2] = { 0 };
gpu_dp_inst_t uniU8PostProcess_2x8 = { {
0xdddddddd, // TCfg
0x44444444, // ASelt
0x13121110, 0x17161514, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (is_2x_up_kernel) if (is_2x_up_kernel)
{ {
gpu_dp_inst_t uniResize2xUp_0_4x8 = { { gpu_dp_inst_t uniResize2xUp_0_4x8 = { {
@ -745,6 +816,23 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16 }; }, GPU_DP_TYPE_16 };
if (!is_same_type)
{
float f2i_radio = 16.0f;
gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);
gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
uniResize2xUp_0_4x8.data[7] = 0x00000700;
uniResize2xUp_1_4x8.data[7] = 0x00000700;
status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
&uniU8PostProcess_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
CHECK_STATUS_FAIL_GOTO(status, final);
}
status = vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8); status = vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height); status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
@ -813,6 +901,27 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant 0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 }; }, GPU_DP_TYPE_16 };
if (!is_same_type)
{
float f2i_radio = 256.0f;
gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);
gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
uniResize3xUp_l00_2x8.data[7] = 0x00000608;
uniResize3xUp_l01_2x8.data[7] = 0x00000608;
uniResize3xUp_l10_4x4.data[7] = 0x00000607;
uniResize3xUp_l11_4x4.data[7] = 0x00000607;
uniResize3xUp_l12_4x4.data[7] = 0x00000607;
uniResize3xUp_l13_4x4.data[7] = 0x00000607;
status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
&uniU8PostProcess_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
CHECK_STATUS_FAIL_GOTO(status, final);
}
status = vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8); status = vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
@ -852,6 +961,25 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16 }; }, GPU_DP_TYPE_16 };
if (!is_same_type)
{
float f2i_radio = 64.0f;
gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);
gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
uniResize4xUp_l00_4x8.data[7] = 0x00000400;
uniResize4xUp_l01_4x8.data[7] = 0x00000400;
uniResize4xUp_l10_4x8.data[7] = 0x00000400;
uniResize4xUp_l11_4x8.data[7] = 0x00000400;
status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
&uniU8PostProcess_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
CHECK_STATUS_FAIL_GOTO(status, final);
}
status = vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8); status = vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
@ -918,6 +1046,29 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16 }; }, GPU_DP_TYPE_16 };
if (!is_same_type)
{
float f2i_radio = 256.0f;
gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);
gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
uniResize8xUp_l00_4x8.data[7] = 0x00000700;
uniResize8xUp_l01_4x8.data[7] = 0x00000700;
uniResize8xUp_l10_4x8.data[7] = 0x00000700;
uniResize8xUp_l11_4x8.data[7] = 0x00000700;
uniResize8xUp_l20_4x8.data[7] = 0x00000700;
uniResize8xUp_l21_4x8.data[7] = 0x00000700;
uniResize8xUp_l30_4x8.data[7] = 0x00000700;
uniResize8xUp_l31_4x8.data[7] = 0x00000700;
status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
&uniU8PostProcess_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
CHECK_STATUS_FAIL_GOTO(status, final);
}
status = vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8); status = vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
@ -929,6 +1080,7 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height); status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final); CHECK_STATUS_FAIL_GOTO(status, final);
} }
}
else else
{ {
VSILOGE("input or output's format is not supported"); VSILOGE("input or output's format is not supported");
@ -1193,22 +1345,22 @@ static vsi_status _query_kernel
if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0]) if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0])
{ {
if (is_same_type && (!align_corners) && (half_pixel_centers) && is_2x_upsample) if ((!align_corners) && (half_pixel_centers) && is_2x_upsample)
{ {
scale_flag = UP_2X_HALF; scale_flag = UP_2X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer; initializer = _bilinear_half_pixel_centers_opt_initializer;
} }
else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_3x_upsample) else if ((!align_corners) && (half_pixel_centers) && is_3x_upsample)
{ {
scale_flag = UP_3X_HALF; scale_flag = UP_3X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer; initializer = _bilinear_half_pixel_centers_opt_initializer;
} }
else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_4x_upsample) else if ((!align_corners) && (half_pixel_centers) && is_4x_upsample)
{ {
scale_flag = UP_4X_HALF; scale_flag = UP_4X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer; initializer = _bilinear_half_pixel_centers_opt_initializer;
} }
else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_8x_upsample) else if ((!align_corners) && (half_pixel_centers) && is_8x_upsample)
{ {
scale_flag = UP_8X_HALF; scale_flag = UP_8X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer; initializer = _bilinear_half_pixel_centers_opt_initializer;
@ -1232,7 +1384,7 @@ static vsi_status _query_kernel
scale_flag = DOWN; scale_flag = DOWN;
} }
key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{ {
if( kernel_map[i].key == key ) if( kernel_map[i].key == key )
@ -1244,7 +1396,7 @@ static vsi_status _query_kernel
if ((UP_OPT == scale_flag) && (i >= kernel_map_size)) if ((UP_OPT == scale_flag) && (i >= kernel_map_size))
{ {
scale_flag = UP; scale_flag = UP;
key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{ {
if( kernel_map[i].key == key ) if( kernel_map[i].key == key )
@ -1257,7 +1409,7 @@ static vsi_status _query_kernel
if ((UP == scale_flag) && (i >= kernel_map_size)) if ((UP == scale_flag) && (i >= kernel_map_size))
{ {
scale_flag = DOWN; scale_flag = DOWN;
key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{ {
if( kernel_map[i].key == key ) if( kernel_map[i].key == key )
@ -1433,7 +1585,7 @@ static vsi_bool _is_image_width_lt16
size_t bytes = vsi_nn_kernel_dtype_get_bytes(in_dtype); size_t bytes = vsi_nn_kernel_dtype_get_bytes(in_dtype);
vsi_size_t max_cross_read_img_width = bytes == 1 ? 16 : 8; vsi_size_t max_cross_read_img_width = bytes == 1 ? 16 : 8;
if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
{ {
return FALSE; return FALSE;
} }
@ -1468,7 +1620,8 @@ static vsi_nn_kernel_node_t _setup
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
vsi_bool is_same_type = vsi_nn_is_same_type(inputs[0], outputs[0]); vsi_bool is_same_type = vsi_nn_is_same_type(inputs[0], outputs[0]);
vsi_bool is_evis2 = (vsi_bool)(graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_2); vsi_bool is_evis2 = \
(vsi_bool)(((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver == VSI_NN_HW_EVIS_2);
vsi_bool is_run_opt_kernel = FALSE; vsi_bool is_run_opt_kernel = FALSE;
vsi_nn_tensor_t* scale = NULL; vsi_nn_tensor_t* scale = NULL;
int32_t pad_left = half_pixel_centers ? 1 : 0; int32_t pad_left = half_pixel_centers ? 1 : 0;
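The new !is_same_type branches above let the N-x upsample kernels requantize when the input and output quantization differ. The pattern is the usual VSI fixed-point rescale: gpu_quantize_multiplier_16bit() turns scale / f2i_radio (with scale = in_scale / out_scale as computed above, and f2i_radio the per-tap-set factor of 16, 64 or 256 in the branches above) into an (M0, postShift) pair, and multAndoutZP folds both zero points in. A plain-C model of the arithmetic that the uniU8PostProcess_2x8 step is set up to perform, with rounding details and all names treated as assumptions rather than the shader's exact implementation:

#include <stdint.h>

/* acc is the raw DP accumulation: f2i_radio times a weighted average of
 * quantized input pixels. The result approximates
 * (acc / f2i_radio - input_zp) * in_scale / out_scale + output_zp. */
static int32_t requant_sketch( int32_t acc, uint16_t M0, int32_t postShift,
                               int32_t input_zp, int32_t output_zp, float f2i_radio )
{
    int64_t bias = ((int64_t)output_zp << postShift)
                 - (int64_t)(input_zp * M0 * f2i_radio);
    return (int32_t)(((int64_t)acc * M0 + bias) >> postShift);
}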

View File

@ -0,0 +1,744 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
B---batch
N---num_heads
S---sequence length
H---head size
*/
typedef enum
{
LAYOUT_NONE,
LAYOUT_BNHS,
LAYOUT_BNH1,
LAYOUT_BSNH,
LAYOUT_BNSH,
} _internal_rope_layout_e;
// Add kernel hashtable here
#define STR(a) #a
#define ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT, INTERLEAVED ) \
((IN0_DTYPE) | (IN1_DTYPE << 8) | (OUT_DTYPE << 16) | (LAYOUT << 24) | (INTERLEAVED << 28))
#define PACK_KERNEL_BNHS_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNHS, 0 ), \
CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnhs"), \
"rope_0" }
#define PACK_KERNEL_BNH1_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNH1, 0 ), \
CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnh1"), \
"rope_1" }
#define PACK_KERNEL_BSNH_INTERLEVEAD_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BSNH, 1 ), \
CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bsnh"), \
"rope_2" }
#define PACK_KERNEL_BNSH_INTERLEVEAD_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNSH, 1 ), \
CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnsh"), \
"rope_3" }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
#define PACK_KERNEL_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
PACK_KERNEL_BNHS_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
PACK_KERNEL_BNH1_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
PACK_KERNEL_BSNH_INTERLEVEAD_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
PACK_KERNEL_BNSH_INTERLEVEAD_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE),
static const _kernel_map_type _rope_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( BF16, BF16, BF16)
PACK_KERNEL_MAP( F16, F16, F16 )
PACK_KERNEL_MAP( I16, I16, I16 )
PACK_KERNEL_MAP( I16, F16, I16 )
PACK_KERNEL_MAP( I16, I16, I8 )
PACK_KERNEL_MAP( I16, F16, I8 )
PACK_KERNEL_MAP( I16, I16, U8 )
PACK_KERNEL_MAP( I16, F16, U8 )
PACK_KERNEL_MAP( U16, U16, U16 )
PACK_KERNEL_MAP( U16, F16, U16 )
PACK_KERNEL_MAP( I8, I8, I8 )
PACK_KERNEL_MAP( I8, F16, I8 )
PACK_KERNEL_MAP( U8, U8, U8 )
PACK_KERNEL_MAP( U8, F16, U8 )
};
/*
* Kernel params
*/
static vx_param_description_t _rope_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROPE_PARAM_NUM _cnt_of_array( _rope_kernel_param_def )
#define SCALAR_AXIS (4)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_rope_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t* out_attr = NULL;
vsi_nn_kernel_tensor_attr_t* in0_attr = NULL;
vsi_nn_kernel_tensor_attr_t* in1_attr = NULL;
vsi_nn_kernel_tensor_attr_t* in2_attr = NULL;
vsi_size_array_t* in_shape = NULL;
vsi_nn_kernel_dtype_e in0_dtype = F16;
vsi_nn_kernel_dtype_e in1_dtype = F16;
vsi_nn_kernel_dtype_e in2_dtype = F16;
vsi_nn_kernel_dtype_e out_dtype = F16;
float in0_scale = 1.0f;
float in1_scale = 1.0f;
float in2_scale = 1.0f;
float output_scale = 1.0f;
float output_zp = 0;
int32_t in0_zp = 0;
int32_t cos_zp = 0;
int32_t sin_zp = 0;
int32_t p = 0;
int32_t axis = 0;
int32_t interleaved = 0;
int32_t half_head_size = 1;
vsi_size_t shape[3] = {1};
uint32_t pack_key = 0;
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param);
VSI_UNREFERENCED(param_size);
// Add initializer
in0_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]);
CHECK_PTR_FAIL_GOTO(in0_attr, "Create tensor attr buffer fail.", final);
in1_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]);
CHECK_PTR_FAIL_GOTO(in1_attr, "Create tensor attr buffer fail.", final);
in2_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]);
CHECK_PTR_FAIL_GOTO(in2_attr, "Create tensor attr buffer fail.", final);
out_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[3]);
CHECK_PTR_FAIL_GOTO(out_attr, "Create tensor attr buffer fail.", final);
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &p);
CHECK_STATUS_FAIL_GOTO(status, final);
axis = p & 0xFFFF;
interleaved = (p >> 16) & 0xFFFF;
in_shape = in0_attr->shape;
in0_dtype = in0_attr->dtype;
in1_dtype = in1_attr->dtype;
in2_dtype = in2_attr->dtype;
out_dtype = out_attr->dtype;
in0_scale = in0_attr->scale;
in1_scale = in1_attr->scale;
in2_scale = in2_attr->scale;
in0_zp = -in0_attr->zero_point;
cos_zp = -in1_attr->zero_point;
sin_zp = -in2_attr->zero_point;
output_scale = out_attr->scale;
output_zp = (float)out_attr->zero_point;
half_head_size = (int32_t)(in_shape->data[axis] / 2);
shape[0] = in_shape->data[0];
shape[1] = in_shape->data[1];
shape[2] = in_shape->data[2];
shape[axis] = half_head_size;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((shape[0] + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = shape[1];
gpu_param.global_size[2] = shape[2];
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \
((IN0_TYPE) | (IN1_TYPE << 8) | (IN2_TYPE << 16) | (OUT_TYPE << 24))
pack_key = _PACK_SELECT_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype);
switch (pack_key)
{
case _PACK_SELECT_KEY(BF16, BF16, BF16, BF16):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = { {
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = { {
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = { {
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
if (interleaved && axis == 0)
{
uniExtractOddData_2x8.data[1] = 0x10101010;
uniExtractOddData_2x8.data[2] = 0x03030101;
uniExtractOddData_2x8.data[3] = 0x07070505;
}
else
{
status = vsi_nn_kernel_gpu_add_param(node,
"half_head_size", &half_head_size);
CHECK_STATUS_FAIL_GOTO(status, final);
}
status = vsi_nn_kernel_gpu_add_param(node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, final);
}
break;
case _PACK_SELECT_KEY(I16, I16, I16, I16):
case _PACK_SELECT_KEY(I16, F16, F16, I16):
case _PACK_SELECT_KEY(I16, I16, I16, I8):
case _PACK_SELECT_KEY(I16, F16, F16, I8):
case _PACK_SELECT_KEY(I16, I16, I16, U8):
case _PACK_SELECT_KEY(I16, F16, F16, U8):
case _PACK_SELECT_KEY(F16, F16, F16, F16):
{
float scale0 = in0_scale * in1_scale / output_scale;
float scale1 = in0_scale* in2_scale / output_scale;
gpu_dp_inst_t uniExtractHalf8_2x8 = { {
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractInteger_2x8 = { {
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniATimesB_0_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x01010101, // BSelt
0x00010000, 0x00030002, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniATimesB_1_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x01010101, // BSelt
0x00050004, 0x00070006, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAEvenTimesB_0_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x01010101, // BSelt
0x00010000, 0x00030002, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAEvenTimesB_1_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x01010101, // BSelt
0x00050004, 0x00070006, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAOddTimesB_0_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00030001, 0x00070005, // ABin
0x01010101, // BSelt
0x00010000, 0x00030002, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAOddTimesB_1_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00030001, 0x00070005, // ABin
0x01010101, // BSelt
0x00050004, 0x00070006, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (interleaved && axis == 0)
{
uniExtractHalf8_2x8.data[1] = 0x10101010;
uniExtractHalf8_2x8.data[2] = 0x02020000;
uniExtractHalf8_2x8.data[3] = 0x06060404;
uniExtractInteger_2x8.data[1] = 0x10101010;
uniExtractInteger_2x8.data[2] = 0x01010000;
uniExtractInteger_2x8.data[3] = 0x03030202;
status = vsi_nn_kernel_gpu_add_param(node,
"uniAEvenTimesB_0_4x4", &uniAEvenTimesB_0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAEvenTimesB_1_4x4", &uniAEvenTimesB_1_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAOddTimesB_0_4x4", &uniAOddTimesB_0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAOddTimesB_1_4x4", &uniAOddTimesB_1_4x4);
}
else
{
status = vsi_nn_kernel_gpu_add_param(node,
"uniATimesB_0_4x4", &uniATimesB_0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniATimesB_1_4x4", &uniATimesB_1_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"half_head_size", &half_head_size);
}
status |= vsi_nn_kernel_gpu_add_param(node,
"scale0", &scale0);
status |= vsi_nn_kernel_gpu_add_param(node,
"scale1", &scale1);
status |= vsi_nn_kernel_gpu_add_param(node,
"output_zp", &output_zp);
if (out_dtype == F16)
{
status |= vsi_nn_kernel_gpu_add_param(node,
"uniExtract8Data_2x8", &uniExtractHalf8_2x8);
}
else
{
status |= vsi_nn_kernel_gpu_add_param(node,
"uniExtract8Data_2x8", &uniExtractInteger_2x8);
}
CHECK_STATUS_FAIL_GOTO(status, final);
}
break;
case _PACK_SELECT_KEY(I8, I8, I8, I8):
case _PACK_SELECT_KEY(U8, U8, U8, U8):
case _PACK_SELECT_KEY(U16, U16, U16, U16):
case _PACK_SELECT_KEY(I8, F16, F16, I8):
case _PACK_SELECT_KEY(U8, F16, F16, U8):
case _PACK_SELECT_KEY(U16, F16, F16, U16):
{
            float scale0 = in0_scale * in1_scale / output_scale;
            float scale1 = in0_scale * in2_scale / output_scale;
gpu_dp_inst_t uniExtractInteger_2x8 = { {
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
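            /*
             * The uniAMinusZp_* / uniAEvenMinusZp / uniAOddMinusZp instructions subtract the
             * input zero point from each lane before the multiply; this is what allows the
             * asymmetric U8/I8/U16 paths of this case (in0_zp, cos_zp and sin_zp are passed
             * to the shader further down).
             */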
gpu_dp_inst_t uniAMinusZp_0_4x4 = { {
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAMinusZp_1_4x4 = { {
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAEvenMinusZp_4x4 = { {
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAOddMinusZp_4x4 = { {
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00030001, 0x00070005, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (interleaved && axis == 0)
{
uniExtractInteger_2x8.data[1] = 0x10101010;
uniExtractInteger_2x8.data[2] = 0x01010000;
uniExtractInteger_2x8.data[3] = 0x03030202;
status = vsi_nn_kernel_gpu_add_param(node,
"uniAEvenMinusZp_4x4", &uniAEvenMinusZp_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAOddMinusZp_4x4", &uniAOddMinusZp_4x4);
}
else
{
status = vsi_nn_kernel_gpu_add_param(node,
"half_head_size", &half_head_size);
}
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAMinusZp_0_4x4", &uniAMinusZp_0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAMinusZp_1_4x4", &uniAMinusZp_1_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"scale0", &scale0);
status |= vsi_nn_kernel_gpu_add_param(node,
"scale1", &scale1);
status |= vsi_nn_kernel_gpu_add_param(node,
"output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node,
"in0_zp", &in0_zp);
status |= vsi_nn_kernel_gpu_add_param(node,
"cos_zp", &cos_zp);
status |= vsi_nn_kernel_gpu_add_param(node,
"sin_zp", &sin_zp);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniExtract8Data_2x8", &uniExtractInteger_2x8);
CHECK_STATUS_FAIL_GOTO(status, final);
}
break;
default:
break;
}
status = vsi_nn_kernel_gpu_config(node, &gpu_param);
final:
if (in0_attr) vsi_nn_kernel_tensor_attr_release(&in0_attr);
if (in1_attr) vsi_nn_kernel_tensor_attr_release(&in1_attr);
if (in2_attr) vsi_nn_kernel_tensor_attr_release(&in2_attr);
if (out_attr) vsi_nn_kernel_tensor_attr_release(&out_attr);
return status;
} /* _rope_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t axis,
int32_t interleaved,
_internal_rope_layout_e *layout
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e in2_dtype;
vsi_nn_kernel_dtype_e out_dtype;
int32_t in0_zp = vsi_nn_get_tensor_zero_point(inputs[0]);
int32_t in1_zp = vsi_nn_get_tensor_zero_point(inputs[1]);
int32_t in2_zp = vsi_nn_get_tensor_zero_point(inputs[2]);
const _kernel_map_type * kernel_map = _rope_kernel_map;
size_t kernel_map_size = _cnt_of_array( _rope_kernel_map );
vx_param_description_t * param_def = _rope_kernel_param_def;
vx_kernel_initialize_f initializer = _rope_initializer;
uint32_t key;
uint32_t i;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
    /* Only symmetric quantization (all zero points == 0) is supported on the int16 paths below */
if ( ( (in0_dtype == I16 && in1_dtype == I16 && out_dtype == I16) ||
(in0_dtype == I16 && in1_dtype == F16 && out_dtype == I16) ||
(in0_dtype == I16 && in1_dtype == F16 && out_dtype == I8) ||
(in0_dtype == I16 && in1_dtype == I16 && out_dtype == I8) ||
(in0_dtype == I16 && in1_dtype == F16 && out_dtype == U8) ||
(in0_dtype == I16 && in1_dtype == I16 && out_dtype == U8) ) &&
(in0_zp != 0 || in1_zp != 0 || in2_zp != 0))
{
return VSI_FAILURE;
}
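    /*
     * Pick an internal layout variant from the rotation axis and the cos/sin cache
     * shape: for axis == 1 use BNH1 when the innermost dim is 1, otherwise BNHS;
     * for axis == 0 use BSNH when the caches share dim 2 with the input and have a
     * unit dim 1, otherwise BNSH. The layout becomes part of the kernel hash key.
     */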
if (axis == 1 && inputs[0]->attr.size[0] == inputs[1]->attr.size[0] &&
in1_dtype == in2_dtype)
{
if (inputs[0]->attr.size[0] == 1)
{
*layout = LAYOUT_BNH1;
}
else
{
*layout = LAYOUT_BNHS;
}
}
else if (axis == 0 && in1_dtype == in2_dtype)
{
if (inputs[0]->attr.size[2] == inputs[1]->attr.size[2] &&
inputs[1]->attr.size[1] == 1)
{
*layout = LAYOUT_BSNH;
}
else
{
*layout = LAYOUT_BNSH;
}
}
key = ROPE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, *layout, interleaved);
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _rope_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ROPE_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;
int32_t i = 0;
int32_t interleaved = 0;
int32_t param = 0;
vsi_size_t shape[3][VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_nn_tensor_t* rs_tensors[4] = { NULL };
vsi_nn_tensor_t* reshape_tensors[4] = { NULL };
_internal_rope_layout_e layout = LAYOUT_NONE;
VSI_UNREFERENCED(params);
axis = vsi_nn_kernel_param_get_int32(params, "axis");
interleaved = vsi_nn_kernel_param_get_int32(params, "interleaved");
    // Check whether the GPU can support this tensor shape
if ( !vsi_nn_kernel_gpu_check_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, axis, interleaved, &layout );
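    /*
     * For the squeezed (size[0] == 1) and BSNH layouts, reshape the tensors so the
     * kernel sees a compact 3D view: the leading unit dimension of input/output is
     * dropped (and axis adjusted), or the non-unit dims of the cos/sin caches are
     * packed to the front.
     */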
if (outputs[0]->attr.size[0] == 1 || layout == LAYOUT_BSNH)
{
memcpy(shape[0], inputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
memcpy(shape[1], inputs[1]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
memcpy(shape[2], outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
if (outputs[0]->attr.size[0] == 1)
{
for (i = 1; i < 3; i++)
{
shape[0][i - 1] = shape[0][i];
shape[1][i - 1] = shape[1][i];
shape[2][i - 1] = shape[2][i];
}
shape[0][2] = 1;
shape[1][2] = 1;
shape[2][2] = 1;
}
else
{
int32_t j = 0;
for (i = 0; i < 3; i++)
{
if (shape[1][i] != 1)
{
shape[1][j] = shape[1][i];
j ++;
}
}
for (; j < 3; j++)
{
shape[1][j] = 1;
}
}
rs_tensors[0] = vsi_nn_reshape_tensor(graph,
inputs[0], shape[0], inputs[0]->attr.dim_num);
rs_tensors[1] = vsi_nn_reshape_tensor(graph,
inputs[1], shape[1], inputs[1]->attr.dim_num);
rs_tensors[2] = vsi_nn_reshape_tensor(graph,
inputs[2], shape[1], inputs[2]->attr.dim_num);
rs_tensors[3] = vsi_nn_reshape_tensor(graph,
outputs[0], shape[2], outputs[0]->attr.dim_num);
if (outputs[0]->attr.size[0] == 1 && axis > 0)
{
axis--;
}
reshape_tensors[0] = rs_tensors[0];
reshape_tensors[1] = rs_tensors[1];
reshape_tensors[2] = rs_tensors[2];
reshape_tensors[3] = rs_tensors[3];
}
else
{
reshape_tensors[0] = inputs[0];
reshape_tensors[1] = inputs[1];
reshape_tensors[2] = inputs[2];
reshape_tensors[3] = outputs[0];
}
param = (interleaved << 16) | axis;
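    /*
     * Both scalars are packed into one kernel argument: the interleaved flag in the
     * upper 16 bits and the axis in the lower 16 bits. A matching unpack on the
     * kernel side would be (sketch only):
     *     axis        = packed & 0xFFFF;
     *     interleaved = (packed >> 16) & 0x1;
     */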
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ROPE_PARAM_NUM,
reshape_tensors, input_num, &reshape_tensors[3], output_num );
/* Pass parameters to node. */
node_params[SCALAR_AXIS] = vsi_nn_kernel_scalar_create(graph, I32, &param);
status = vsi_nn_kernel_node_pass_param( node, node_params, _ROPE_PARAM_NUM );
vsi_nn_kernel_scalar_release(&node_params[SCALAR_AXIS]);
}
}
for (i = 0; i < 4; i++)
{
vsi_safe_release_tensor(rs_tensors[i]);
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( rope, _setup )

View File

@ -186,18 +186,26 @@ static const _kernel_map_type scatter_nd_update_special_ref_map[] =
{ {
TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_4)
}; };
static const _kernel_map_type scatter_nd_update_special_update_map[] = static const _kernel_map_type scatter_nd_update_special_update_map[] =
{ {
TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_4)
}; };
static const _kernel_map_type scatter_nd_update_special_copy_map[] = static const _kernel_map_type scatter_nd_update_special_copy_map[] =
{ {
TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
}; };
/* /*
@ -563,6 +571,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer)
{ {
case _PACK_SELECT_KEY( I8, I8 ): case _PACK_SELECT_KEY( I8, I8 ):
case _PACK_SELECT_KEY( U8, U8 ): case _PACK_SELECT_KEY( U8, U8 ):
case _PACK_SELECT_KEY( I16, I16 ):
case _PACK_SELECT_KEY( U16, U16 ):
{ {
uint16_t M0 = 0; uint16_t M0 = 0;
int32_t postShift0 = 0; int32_t postShift0 = 0;
@ -605,6 +615,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError ); CHECK_STATUS_FAIL_GOTO(status, OnError );
} }
break; break;
case _PACK_SELECT_KEY( F16, F16 ):
break;
default: default:
break; break;
} }
@ -759,6 +771,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer)
{ {
case _PACK_SELECT_KEY( I8, I8 ): case _PACK_SELECT_KEY( I8, I8 ):
case _PACK_SELECT_KEY( U8, U8 ): case _PACK_SELECT_KEY( U8, U8 ):
case _PACK_SELECT_KEY( I16, I16 ):
case _PACK_SELECT_KEY( U16, U16 ):
{ {
uint16_t M1 = 0; uint16_t M1 = 0;
int32_t postShift1 = 0; int32_t postShift1 = 0;
@ -801,6 +815,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError ); CHECK_STATUS_FAIL_GOTO(status, OnError );
} }
break; break;
case _PACK_SELECT_KEY( F16, F16 ):
break;
default: default:
break; break;
} }
@ -1597,6 +1613,19 @@ static vsi_status _query_kernel_special
status |= VSI_FAILURE; status |= VSI_FAILURE;
} }
if (input0_dtype == F16)
{
input0_dtype = U16;
}
if (input2_dtype == F16)
{
input2_dtype = U16;
}
if (output_dtype == F16)
{
output_dtype = U16;
}
key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 6, 1, 0); key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 6, 1, 0);
for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_copy_map); i ++ ) for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_copy_map); i ++ )

View File

@ -27,6 +27,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "vsi_nn_types.h" #include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h" #include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h" #include "vsi_nn_graph.h"
#include "vsi_nn_log.h" #include "vsi_nn_log.h"
@ -591,7 +592,7 @@ static vsi_nn_kernel_node_t _setup
VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num); VSI_UNREFERENCED(output_num);
#if (VX_ACTIVATION_EXT_SUPPORT) #if (VX_ACTIVATION_EXT_SUPPORT)
if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
{ {
return NULL; return NULL;
} }

View File

@ -548,16 +548,16 @@ static vsi_status _gpu_register
vsi_status status; vsi_status status;
vx_kernel_description_t* info; vx_kernel_description_t* info;
vx_kernel obj; vx_kernel obj;
vsi_nn_context_t context;
vx_program program = NULL; vx_program program = NULL;
const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt; const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt;
vsi_nn_runtime_option_t* options;
options = ((vsi_nn_graph_prv_t*)graph)->options;
#define MAX_BUILDPROGRAM_LEN 1024 #define MAX_BUILDPROGRAM_LEN 1024
char cmd[MAX_BUILDPROGRAM_LEN] = { 0 }; char cmd[MAX_BUILDPROGRAM_LEN] = { 0 };
size_t cost_bytes = 0; size_t cost_bytes = 0;
memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN ); memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN );
context = graph->ctx;
status = VSI_FAILURE; status = VSI_FAILURE;
info = &(kernel->info); info = &(kernel->info);
@ -579,21 +579,21 @@ static vsi_status _gpu_register
return status; return status;
} }
if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE ) if (options->config.evis.ver == VSI_NN_HW_EVIS_NONE)
{ {
// set default evis version is 2 // set default evis version is 2
if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type ) if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type )
{ {
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d",
context->config.use_40bits_va ); options->config.use_40bits_va );
} }
} }
else else
{ {
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
context->config.evis.ver, context->config.use_40bits_va ); options->config.evis.ver, options->config.use_40bits_va );
} }
// Pack build option // Pack build option
if( kernel->gpu.sources[active_fmt].build_option.data ) if( kernel->gpu.sources[active_fmt].build_option.data )
@ -655,16 +655,16 @@ static vsi_status _gpu_register_ext
vsi_status status; vsi_status status;
vx_kernel_description_t* info; vx_kernel_description_t* info;
vx_kernel obj; vx_kernel obj;
vsi_nn_context_t context;
vx_program program = NULL; vx_program program = NULL;
const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt; const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt;
vsi_nn_runtime_option_t* options;
options = ((vsi_nn_graph_prv_t*)graph)->options;
#define MAX_BUILDPROGRAM_LEN 1024 #define MAX_BUILDPROGRAM_LEN 1024
char cmd[MAX_BUILDPROGRAM_LEN] = { 0 }; char cmd[MAX_BUILDPROGRAM_LEN] = { 0 };
size_t cost_bytes = 0; size_t cost_bytes = 0;
memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN ); memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN );
context = graph->ctx;
status = VSI_FAILURE; status = VSI_FAILURE;
info = &(kernel->info); info = &(kernel->info);
@ -686,21 +686,21 @@ static vsi_status _gpu_register_ext
return status; return status;
} }
if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE ) if (options->config.evis.ver == VSI_NN_HW_EVIS_NONE)
{ {
// set default evis version is 2 // set default evis version is 2
if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type ) if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type )
{ {
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d",
context->config.use_40bits_va ); options->config.use_40bits_va );
} }
} }
else else
{ {
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN, cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
context->config.evis.ver, context->config.use_40bits_va ); options->config.evis.ver, options->config.use_40bits_va );
} }
// Pack build option // Pack build option
if( kernel->gpu.sources[active_fmt].build_option.data ) if( kernel->gpu.sources[active_fmt].build_option.data )
@ -1258,7 +1258,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector
} }
/* Skip evis if not support */ /* Skip evis if not support */
if( type == VSI_NN_KERNEL_TYPE_EVIS if( type == VSI_NN_KERNEL_TYPE_EVIS
&& graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_NONE ) && ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver == VSI_NN_HW_EVIS_NONE )
{ {
continue; continue;
} }
@ -1677,7 +1677,7 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
int32_t enableShader = ((vsi_nn_graph_prv_t*)graph)->options->enable_shader; int32_t enableShader = ((vsi_nn_graph_prv_t*)graph)->options->enable_shader;
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
if ( graph->ctx->config.subGroupSize == 0 ) if ( ((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize == 0 )
{ {
return FALSE; return FALSE;
} }

View File

@ -162,15 +162,11 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(pow)
#if (VX_TENSOR_GATHER_API_SUPPORT) #if (VX_TENSOR_GATHER_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(gather) REGISTER_VX_FIRST_KERNEL_SELECTOR(gather)
#endif #endif
#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(relational_ops) REGISTER_VX_FIRST_KERNEL_SELECTOR(relational_ops)
#endif
#if (VX_TENSOR_TILE_API_SUPPORT) #if (VX_TENSOR_TILE_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(tile) REGISTER_VX_FIRST_KERNEL_SELECTOR(tile)
#endif #endif
#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(layer_norm) REGISTER_VX_FIRST_KERNEL_SELECTOR(layer_norm)
#endif
#if (VX_ACTIVATION_EXP_VX_SUPPORT_EXT) #if (VX_ACTIVATION_EXP_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(exp) REGISTER_VX_FIRST_KERNEL_SELECTOR(exp)
#endif #endif
@ -184,6 +180,7 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(log_softmax)
#if (VX_BITCAST_VX_SUPPORT) #if (VX_BITCAST_VX_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(bitcast) REGISTER_VX_FIRST_KERNEL_SELECTOR(bitcast)
#endif #endif
REGISTER_VX_FIRST_KERNEL_SELECTOR(group_norm)
REGISTER_VX_FIRST_KERNEL_SELECTOR(instance_norm)
__END_DECLS __END_DECLS

View File

@ -0,0 +1,89 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if VX_GROUP_NORMALIZATION_VX_SUPPORT
#define REGISTER_GROUP_NORM_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_GROUP_NORM_OPENVX_KERNEL(group_norm)
{
vx_node node = NULL;
float eps = vsi_nn_kernel_param_get_float32(params, "eps");
int32_t group_num = vsi_nn_kernel_param_get_int32(params, "group_num");
vx_tensor inputs_tensor[3] = { NULL };
vx_tensor output_tensor = NULL;
inputs_tensor[0] = inputs[0]->t;
inputs_tensor[1] = inputs[1]->t;
inputs_tensor[2] = inputs[2]->t;
output_tensor = outputs[0]->t;
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(kernel);
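    /*
     * Only build the OpenVX node when the target reports FFD or stream-processor
     * support; otherwise node stays NULL, which presumably lets the framework fall
     * back to another kernel backend for group normalization.
     */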
if (graph->ctx->config.support_ffd ||
graph->ctx->config.support_stream_processor)
{
node = vxGroupNormalizationLayer(
graph->g,
eps,
group_num,
inputs_tensor,
(vx_uint32)input_num,
output_tensor
);
}
return (vsi_nn_kernel_node_t)node;
} /* group_norm() */
#endif

View File

@ -0,0 +1,87 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if VX_INSTANCE_NORMALIZATION_VX_SUPPORT
#define REGISTER_INSTANCE_NORM_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_INSTANCE_NORM_OPENVX_KERNEL(instance_norm)
{
vsi_nn_kernel_node_t node = NULL;
float eps = vsi_nn_kernel_param_get_float32(params, "eps");
vx_tensor inputs_tensor[3] = { NULL };
vx_tensor output_tensor = NULL;
inputs_tensor[0] = inputs[0]->t;
inputs_tensor[1] = inputs[1]->t;
inputs_tensor[2] = inputs[2]->t;
output_tensor = outputs[0]->t;
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(kernel);
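    /*
     * Same capability gate as group_norm: create the OpenVX node only on FFD or
     * stream-processor capable targets, otherwise return NULL to fall back.
     */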
if (graph->ctx->config.support_ffd ||
graph->ctx->config.support_stream_processor)
{
node = vxInstanceNormalizationLayer(
graph->g,
eps,
inputs_tensor,
(vx_uint32)input_num,
output_tensor
);
}
return (vsi_nn_kernel_node_t)node;
} /* instance_norm() */
#endif

View File

@ -30,7 +30,7 @@
#include "vsi_nn_tensor_util.h" #include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel.h"
#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) #if (VX_LAYER_NORMALIZATION_VX_SUPPORT)
#define REGISTER_LAYER_NORM_OPENVX_KERNEL( kernel_name ) \ #define REGISTER_LAYER_NORM_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \ static vsi_nn_kernel_node_t _##kernel_name##setup \
( \ ( \
@ -71,6 +71,11 @@ REGISTER_LAYER_NORM_OPENVX_KERNEL( layer_norm )
inputs_tensor[2] = inputs[2]->t; inputs_tensor[2] = inputs[2]->t;
output_tensor = outputs[0]->t; output_tensor = outputs[0]->t;
#if !defined(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) || !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
if (graph->ctx->config.support_ffd ||
graph->ctx->config.support_stream_processor)
#endif
{
node = vxLayerNormalizationLayer( node = vxLayerNormalizationLayer(
graph->g, graph->g,
eps, eps,
@ -79,6 +84,7 @@ REGISTER_LAYER_NORM_OPENVX_KERNEL( layer_norm )
(uint32_t)input_num, (uint32_t)input_num,
output_tensor output_tensor
); );
}
return (vsi_nn_kernel_node_t)node; return (vsi_nn_kernel_node_t)node;
} /* layer_norm() */ } /* layer_norm() */

View File

@ -89,9 +89,10 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 )
if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
{ {
vsi_nn_tensor_attr_t attr; vsi_nn_tensor_attr_t attr;
memcpy( &attr, &outputs[0]->attr, sizeof( attr ) ); memcpy( &attr, &outputs[0]->attr, sizeof( attr ) );
memcpy( &attr.size, &inputs[0]->attr.size, sizeof( attr.size ) ); memcpy( &attr.size, &inputs[0]->attr.size, sizeof( attr.size ) );
attr.vtl = FALSE; attr.vtl = TRUE;
attr.is_const = FALSE; attr.is_const = FALSE;
convert_tensor = vsi_nn_CreateTensor(graph, &attr); convert_tensor = vsi_nn_CreateTensor(graph, &attr);

View File

@ -30,7 +30,7 @@
#include "vsi_nn_tensor_util.h" #include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel.h"
#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT) #if (VX_RELATIONAL_OPS_VX_SUPPORT)
#define REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( kernel_name ) \ #define REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \ static vsi_nn_kernel_node_t _##kernel_name##setup \
@ -68,12 +68,25 @@ REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( relational_ops )
VSI_UNREFERENCED(kernel); VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(output_num); VSI_UNREFERENCED(output_num);
node = vxRelationalLayer(graph->g, #if !defined(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) || !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
if (vsi_nn_is_broadcast_operaton(inputs, input_num, outputs[0]))
{
return NULL;
}
#endif
#if !defined(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) || !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
if (graph->ctx->config.support_stream_processor)
#endif
{
node = vxRelationalLayer(
graph->g,
operation, operation,
inputs_tensor, inputs_tensor,
(uint32_t)input_num, (uint32_t)input_num,
outputs[0]->t outputs[0]->t
); );
}
return (vsi_nn_kernel_node_t)node; return (vsi_nn_kernel_node_t)node;
} /* relational_ops() */ } /* relational_ops() */

View File

@ -23,6 +23,7 @@
*****************************************************************************/ *****************************************************************************/
#include "vsi_nn_types.h" #include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h" #include "vsi_nn_tensor.h"
#include "vsi_nn_node.h" #include "vsi_nn_node.h"
#include "vsi_nn_log.h" #include "vsi_nn_log.h"
@ -66,7 +67,7 @@ REGISTER_SWISH_OPENVX_KERNEL( swish )
VSI_UNREFERENCED(output_num); VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(input_num);
if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
{ {
swish_type = (vsi_nn_swish_type)vsi_nn_kernel_param_get_int32(params, "type"); swish_type = (vsi_nn_swish_type)vsi_nn_kernel_param_get_int32(params, "type");

View File

@ -67,8 +67,8 @@ __kernel void cumsum_F32toF32_axis2(
} }
} }
#define CUMSUM_toU8_AXIS2_SH(name, src_type, read_image_type) \ #define CUMSUM_toINT_AXIS2_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##toU8_axis2( \ __kernel void cumsum_##name##_axis2( \
__read_only image2d_array_t input, \ __read_only image2d_array_t input, \
__write_only image2d_array_t output, \ __write_only image2d_array_t output, \
int axis, \ int axis, \
@ -87,19 +87,19 @@ __kernel void cumsum_##name##toU8_axis2( \
int4 coord_out = coord; \ int4 coord_out = coord; \
\ \
src_type sum = (src_type)(0); \ src_type sum = (src_type)(0); \
uint4 dst = (uint4)(0); \ dst_type dst = (dst_type)(0); \
int tmp_zp = convert_int_rte(output_zp); \ int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); \ dst.x = convert_dtype(tmp_zp); \
\ \
float cnt = 0.0f; \ float cnt = 0.0f; \
\ \
if(exclusive && rev) \ if(exclusive && rev) \
{ \ { \
coord_out.z = channel - 1; \ coord_out.z = channel - 1; \
write_imageui(output, coord_out, dst); \ image_write(output, coord_out, dst); \
for(coord.z = channel - 1; coord.z > 0; coord.z--) \ for(coord.z = channel - 1; coord.z > 0; coord.z--) \
{ \ { \
src_type data = read_image_type(input, coord); \ src_type data = image_read(input, coord); \
coord_out.z--; \ coord_out.z--; \
cnt += 1.0f; \ cnt += 1.0f; \
sum += data; \ sum += data; \
@ -107,17 +107,17 @@ __kernel void cumsum_##name##toU8_axis2( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\ \
dst.x = (uint)convert_int_rte(tmpSum); \ dst.x = convert_dtype(tmpSum); \
write_imageui(output, coord_out, dst); \ image_write(output, coord_out, dst); \
} \ } \
} \ } \
else if(exclusive) \ else if(exclusive) \
{ \ { \
coord_out.z = 0; \ coord_out.z = 0; \
write_imageui(output, coord_out, dst); \ image_write(output, coord_out, dst); \
for(coord.z = 0; coord.z < channel - 1; coord.z++) \ for(coord.z = 0; coord.z < channel - 1; coord.z++) \
{ \ { \
src_type data = read_image_type(input, coord); \ src_type data = image_read(input, coord); \
coord_out.z++; \ coord_out.z++; \
cnt += 1.0f; \ cnt += 1.0f; \
sum += data; \ sum += data; \
@ -125,45 +125,44 @@ __kernel void cumsum_##name##toU8_axis2( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\ \
dst.x = (uint)convert_int_rte(tmpSum); \ dst.x = convert_dtype(tmpSum); \
write_imageui(output, coord_out, dst); \ image_write(output, coord_out, dst); \
} \ } \
} \ } \
else if(rev) \ else if(rev) \
{ \ { \
for(coord.z = channel - 1; coord.z >= 0; coord.z--) \ for(coord.z = channel - 1; coord.z >= 0; coord.z--) \
{ \ { \
src_type data = read_image_type(input, coord); \ src_type data = image_read(input, coord); \
cnt += 1.0f; \ cnt += 1.0f; \
sum += data; \ sum += data; \
\ \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\ \
dst.x = (uint)convert_int_rte(tmpSum); \ dst.x = convert_dtype(tmpSum); \
write_imageui(output, coord, dst); \ image_write(output, coord, dst); \
} \ } \
} \ } \
else \ else \
{ \ { \
for(coord.z = 0; coord.z < channel; coord.z++) \ for(coord.z = 0; coord.z < channel; coord.z++) \
{ \ { \
src_type data = read_image_type(input, coord); \ src_type data = image_read(input, coord); \
cnt += 1.0f; \ cnt += 1.0f; \
sum += data; \ sum += data; \
\ \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\ \
dst.x = (uint)convert_int_rte(tmpSum); \ dst.x = convert_dtype(tmpSum); \
write_imageui(output, coord, dst); \ image_write(output, coord, dst); \
} \ } \
} \ } \
} }
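// Instantiations below generate cumsum_<SRC>to<DST>_axis2 kernels for the existing
// U8toU8 and F32toU8 variants plus the new I32toI32 variant.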
CUMSUM_toU8_AXIS2_SH(U8,uint4,read_imageui) CUMSUM_toINT_AXIS2_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toU8_AXIS2_SH(F32,float4,read_imagef) CUMSUM_toINT_AXIS2_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS2_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
__kernel void cumsum_F32toF32_axis1( __kernel void cumsum_F32toF32_axis1(
__read_only image2d_array_t input, __read_only image2d_array_t input,
@ -233,8 +232,8 @@ __kernel void cumsum_F32toF32_axis1(
} }
} }
#define CUMSUM_toU8_AXIS1_SH(name, src_type, read_image_type) \ #define CUMSUM_toINT_AXIS1_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##toU8_axis1( \ __kernel void cumsum_##name##_axis1( \
__read_only image2d_array_t input, \ __read_only image2d_array_t input, \
__write_only image2d_array_t output, \ __write_only image2d_array_t output, \
int axis, \ int axis, \
@ -253,20 +252,20 @@ __kernel void cumsum_##name##toU8_axis1( \
int4 coord_out = coord; \ int4 coord_out = coord; \
\ \
src_type sum = (src_type)(0); \ src_type sum = (src_type)(0); \
uint4 dst = (uint4)(0); \ dst_type dst = (dst_type)(0); \
int tmp_zp = convert_int_rte(output_zp); \ int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); \ dst.x = convert_dtype(tmp_zp); \
\ \
float cnt = 0; \ float cnt = 0; \
\ \
if(exclusive && rev) \ if(exclusive && rev) \
{ \ { \
coord_out.y = height - 1; \ coord_out.y = height - 1; \
write_imageui(output, coord_out, dst); \ image_write(output, coord_out, dst); \
\ \
for(coord.y = height - 1; coord.y > 0; coord.y--) \ for(coord.y = height - 1; coord.y > 0; coord.y--) \
{ \ { \
src_type data = read_image_type(input, coord); \ src_type data = image_read(input, coord); \
cnt += 1.0f; \ cnt += 1.0f; \
coord_out.y--; \ coord_out.y--; \
sum += data; \ sum += data; \
@ -274,17 +273,17 @@ __kernel void cumsum_##name##toU8_axis1( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\ \
dst.x = (uint)convert_int_rte(tmpSum); \ dst.x = convert_dtype(tmpSum); \
write_imageui(output, coord_out, dst); \ image_write(output, coord_out, dst); \
} \ } \
} \ } \
else if(exclusive) \ else if(exclusive) \
{ \ { \
coord_out.y = 0; \ coord_out.y = 0; \
write_imageui(output, coord_out, dst); \ image_write(output, coord_out, dst); \
for(coord.y = 0; coord.y < height - 1; coord.y++) \ for(coord.y = 0; coord.y < height - 1; coord.y++) \
{ \ { \
src_type data = read_image_type(input, coord); \ src_type data = image_read(input, coord); \
cnt += 1.0f; \ cnt += 1.0f; \
coord_out.y++; \ coord_out.y++; \
sum += data; \ sum += data; \
@ -292,44 +291,44 @@ __kernel void cumsum_##name##toU8_axis1( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\ \
dst.x = (uint)convert_int_rte(tmpSum); \ dst.x = convert_dtype(tmpSum); \
write_imageui(output, coord_out, dst); \ image_write(output, coord_out, dst); \
} \ } \
} \ } \
else if(rev) \ else if(rev) \
{ \ { \
for(coord.y = height - 1; coord.y >= 0; coord.y--) \ for(coord.y = height - 1; coord.y >= 0; coord.y--) \
{ \ { \
src_type data = read_image_type(input, coord); \ src_type data = image_read(input, coord); \
cnt += 1.0f; \ cnt += 1.0f; \
sum += data; \ sum += data; \
\ \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\ \
dst.x = (uint)convert_int_rte(tmpSum); \ dst.x = convert_dtype(tmpSum); \
write_imageui(output, coord, dst); \ image_write(output, coord, dst); \
} \ } \
} \ } \
else \ else \
{ \ { \
for(coord.y = 0; coord.y < height; coord.y++) \ for(coord.y = 0; coord.y < height; coord.y++) \
{ \ { \
src_type data = read_image_type(input, coord); \ src_type data = image_read(input, coord); \
cnt += 1.0f; \ cnt += 1.0f; \
sum += data; \ sum += data; \
\ \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\ \
dst.x = (uint)convert_int_rte(tmpSum); \ dst.x = convert_dtype(tmpSum); \
write_imageui(output, coord, dst); \ image_write(output, coord, dst); \
} \ } \
} \ } \
} }
CUMSUM_toU8_AXIS1_SH(U8,uint4,read_imageui) CUMSUM_toINT_AXIS1_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toU8_AXIS1_SH(F32,float4,read_imagef) CUMSUM_toINT_AXIS1_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS1_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
__kernel void cumsum_F32toF32_axis0( __kernel void cumsum_F32toF32_axis0(
__read_only image2d_array_t input, __read_only image2d_array_t input,
@ -399,8 +398,8 @@ __kernel void cumsum_F32toF32_axis0(
} }
} }
#define CUMSUM_toU8_AXIS0_SH(name, src_type, read_image_type) \ #define CUMSUM_toINT_AXIS0_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##toU8_axis0( \ __kernel void cumsum_##name##_axis0( \
__read_only image2d_array_t input, \ __read_only image2d_array_t input, \
__write_only image2d_array_t output, \ __write_only image2d_array_t output, \
int axis, \ int axis, \
@ -419,19 +418,19 @@ __kernel void cumsum_##name##toU8_axis0( \
int4 coord_out = coord; \ int4 coord_out = coord; \
\ \
src_type sum = (src_type)(0); \ src_type sum = (src_type)(0); \
uint4 dst = (uint4)(0); \ dst_type dst = (dst_type)(0); \
int tmp_zp = convert_int_rte(output_zp); \ int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); \ dst.x = convert_dtype(tmp_zp); \
\ \
float cnt = 0; \ float cnt = 0; \
\ \
if(exclusive && rev) \ if(exclusive && rev) \
{ \ { \
coord_out.x = width - 1; \ coord_out.x = width - 1; \
write_imageui(output, coord_out, dst); \ image_write(output, coord_out, dst); \
for(coord.x = width - 1; coord.x > 0; coord.x--) \ for(coord.x = width - 1; coord.x > 0; coord.x--) \
{ \ { \
src_type data = read_image_type(input, coord); \ src_type data = image_read(input, coord); \
coord_out.x--; \ coord_out.x--; \
cnt += 1.0f; \ cnt += 1.0f; \
sum += data; \ sum += data; \
@ -439,8 +438,8 @@ __kernel void cumsum_##name##toU8_axis0( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\ \
dst.x = (uint)convert_int_rte(tmpSum); \ dst.x = convert_dtype(tmpSum); \
write_imageui(output, coord_out, dst); \ image_write(output, coord_out, dst); \
} \ } \
} \ } \
else if(exclusive) \ else if(exclusive) \
@ -449,7 +448,7 @@ __kernel void cumsum_##name##toU8_axis0( \
write_imageui(output, coord_out, dst); \ write_imageui(output, coord_out, dst); \
for(coord.x = 0; coord.x < width - 1; coord.x++) \ for(coord.x = 0; coord.x < width - 1; coord.x++) \
{ \ { \
src_type data = read_image_type(input, coord); \ src_type data = image_read(input, coord); \
coord_out.x++; \ coord_out.x++; \
cnt += 1.0f; \ cnt += 1.0f; \
sum += data; \ sum += data; \
@ -457,40 +456,42 @@ __kernel void cumsum_##name##toU8_axis0( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\ \
dst.x = (uint)convert_int_rte(tmpSum); \ dst.x = convert_dtype(tmpSum); \
write_imageui(output, coord_out, dst); \ image_write(output, coord_out, dst); \
} \ } \
} \ } \
else if(rev) \ else if(rev) \
{ \ { \
for(coord.x = width - 1; coord.x >= 0; coord.x--) \ for(coord.x = width - 1; coord.x >= 0; coord.x--) \
{ \ { \
src_type data = read_image_type(input, coord); \ src_type data = image_read(input, coord); \
cnt += 1.0f; \ cnt += 1.0f; \
sum += data; \ sum += data; \
\ \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\ \
dst.x = (uint)convert_int_rte(tmpSum); \ dst.x = convert_dtype(tmpSum); \
write_imageui(output, coord, dst); \ image_write(output, coord, dst); \
} \ } \
} \ } \
else \ else \
{ \ { \
for(coord.x = 0; coord.x < width; coord.x++) \ for(coord.x = 0; coord.x < width; coord.x++) \
{ \ { \
src_type data = read_image_type(input, coord); \ src_type data = image_read(input, coord); \
cnt += 1.0f; \ cnt += 1.0f; \
sum += data; \ sum += data; \
\ \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \ float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\ \
dst.x = (uint)convert_int_rte(tmpSum); \ dst.x = convert_dtype(tmpSum); \
write_imageui(output, coord, dst); \ image_write(output, coord, dst); \
} \ } \
} \ } \
} }
CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui) CUMSUM_toINT_AXIS0_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef) CUMSUM_toINT_AXIS0_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS0_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)

View File

@ -65,188 +65,100 @@ __kernel void cumsum_F32toF32_axis1_2D(
} }
} }
__kernel void cumsum_U8toU8_axis1_2D( #define CUMSUM_INT_AXIS1_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__read_only image2d_t input, __kernel void cumsum_##name##_axis1_2D( \
__write_only image2d_t output, __read_only image2d_t input, \
int axis, __write_only image2d_t output, \
int exclusive, int axis, \
int rev, int exclusive, \
int width, int rev, \
int height, int width, \
int chn, int height, \
int input_zp, int chn, \
float in_out_scale, int input_zp, \
float in_out_zp_scale, float in_out_scale, \
float output_zp float in_out_zp_scale, \
) float output_zp \
{ ) \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); { \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
uint4 sum = (uint4)(0); \
uint4 dst = (uint4)(0); src_type sum = (src_type)(0); \
dst_type dst = (dst_type)(0); \
int tmp_zp = convert_int_rte(output_zp); int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); dst.x = convert_dtype(tmp_zp); \
\
float cnt = 0; float cnt = 0; \
\
if(exclusive && rev) if(exclusive && rev) \
{ { \
coord.w = height - 1; coord.w = height - 1; \
write_imageui(output, coord.zw, dst); image_write(output, coord.zw, dst); \
for(coord.y = height - 1; coord.y > 0; coord.y--) for(coord.y = height - 1; coord.y > 0; coord.y--) \
{ { \
uint4 data = read_imageui(input, coord.xy); src_type data = image_read(input, coord.xy); \
cnt += 1.0f; cnt += 1.0f; \
coord.w--; coord.w--; \
sum += data; sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); dst.x = convert_dtype(tmpSum); \
write_imageui(output, coord.zw, dst); image_write(output, coord.zw, dst); \
} } \
} } \
else if(exclusive) else if(exclusive) \
{ { \
write_imageui(output, coord.zw, dst); image_write(output, coord.zw, dst); \
for(coord.y = 0; coord.y < height - 1; coord.y++) for(coord.y = 0; coord.y < height - 1; coord.y++) \
{ { \
uint4 data = read_imageui(input, coord.xy); src_type data = image_read(input, coord.xy); \
cnt += 1.0f; cnt += 1.0f; \
coord.w++; coord.w++; \
sum += data; sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); dst.x = convert_dtype(tmpSum); \
write_imageui(output, coord.zw, dst); image_write(output, coord.zw, dst); \
} } \
} } \
else if(rev) else if(rev) \
{ { \
for(coord.y = height - 1; coord.y >= 0; coord.y--) for(coord.y = height - 1; coord.y >= 0; coord.y--) \
{ { \
uint4 data = read_imageui(input, coord.xy); src_type data = image_read(input, coord.xy); \
cnt += 1.0f; cnt += 1.0f; \
sum += data; sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); dst.x = convert_dtype(tmpSum); \
write_imageui(output, coord.xy, dst); image_write(output, coord.xy, dst); \
} } \
} } \
else else \
{ { \
for(coord.y = 0; coord.y < height; coord.y++) for(coord.y = 0; coord.y < height; coord.y++) \
{ { \
uint4 data = read_imageui(input, coord.xy); src_type data = image_read(input, coord.xy); \
cnt += 1.0f; cnt += 1.0f; \
sum += data; sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); dst.x = convert_dtype(tmpSum); \
write_imageui(output, coord.xy, dst); image_write(output, coord.xy, dst); \
} } \
} } \
}
__kernel void cumsum_F32toU8_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float4 sum = (float4)(0);
uint4 dst = (uint4)(0);
int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);
float cnt = 0;
if(exclusive && rev)
{
coord.w = height - 1;
write_imageui(output, coord.zw, dst);
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
coord.w--;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(exclusive)
{
write_imageui(output, coord.zw, dst);
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
coord.w++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
} }
CUMSUM_INT_AXIS1_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_INT_AXIS1_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_INT_AXIS1_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
__kernel void cumsum_F32toF32_axis0_2D( __kernel void cumsum_F32toF32_axis0_2D(
__read_only image2d_t input, __read_only image2d_t input,
@ -316,188 +228,100 @@ __kernel void cumsum_F32toF32_axis0_2D(
} }
} }
__kernel void cumsum_U8toU8_axis0_2D( #define CUMSUM_INT_AXIS0_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__read_only image2d_t input, __kernel void cumsum_##name##_axis0_2D( \
__write_only image2d_t output, __read_only image2d_t input, \
int axis, __write_only image2d_t output, \
int exclusive, int axis, \
int rev, int exclusive, \
int width, int rev, \
int height, int width, \
int chn, int height, \
int input_zp, int chn, \
float in_out_scale, int input_zp, \
float in_out_zp_scale, float in_out_scale, \
float output_zp float in_out_zp_scale, \
) float output_zp \
{ ) \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); { \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
uint4 sum = (uint4)(0); \
uint4 dst = (uint4)(0); src_type sum = (src_type)(0); \
dst_type dst = (dst_type)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_dtype(tmp_zp); \
float cnt = 0.0f; \
float cnt = 0.0f; \
if(exclusive && rev) \
{ if(exclusive && rev) \
coord.x = width - 1; { \
coord.z = coord.x; coord.x = width - 1; \
write_imageui(output, coord.zw, dst); coord.z = coord.x; \
for(; coord.x > 0; coord.x--) image_write(output, coord.zw, dst); \
{ for(; coord.x > 0; coord.x--) \
uint4 data = read_imageui(input, coord.xy); { \
coord.z--; src_type data = image_read(input, coord.xy); \
cnt += 1.0; coord.z--; \
sum += data; cnt += 1.0; \
sum += data; \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord.zw, dst); dst.x = convert_dtype(tmpSum); \
} image_write(output, coord.zw, dst); \
} } \
else if(exclusive) } \
{ else if(exclusive) \
coord.z = 0; { \
write_imageui(output, coord.zw, dst); coord.z = 0; \
for(coord.x = 0; coord.x < width - 1; coord.x++) image_write(output, coord.zw, dst); \
{ for(coord.x = 0; coord.x < width - 1; coord.x++) \
uint4 data = read_imageui(input, coord.xy); { \
cnt += 1.0f; src_type data = image_read(input, coord.xy); \
coord.z++; cnt += 1.0f; \
sum += data; coord.z++; \
sum += data; \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord.zw, dst); dst.x = convert_dtype(tmpSum); \
} image_write(output, coord.zw, dst); \
} } \
else if(rev) } \
{ else if(rev) \
for(coord.x = width - 1; coord.x >= 0; coord.x--) { \
{ for(coord.x = width - 1; coord.x >= 0; coord.x--) \
uint4 data = read_imageui(input, coord.xy); { \
cnt += 1.0f; src_type data = image_read(input, coord.xy); \
sum += data; cnt += 1.0f; \
sum += data; \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord.xy, dst); dst.x = convert_dtype(tmpSum); \
} image_write(output, coord.xy, dst); \
} } \
else } \
{ else \
for(coord.x = 0; coord.x < width; coord.x++) { \
{ for(coord.x = 0; coord.x < width; coord.x++) \
uint4 data = read_imageui(input, coord.xy); { \
cnt += 1.0f; src_type data = image_read(input, coord.xy); \
sum += data; cnt += 1.0f; \
sum += data; \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord.xy, dst); dst.x = convert_dtype(tmpSum); \
} image_write(output, coord.xy, dst); \
} } \
} } \
__kernel void cumsum_F32toU8_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float4 sum = (float4)(0);
uint4 dst = (uint4)(0);
int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);
float cnt = 0.0f;
if(exclusive && rev)
{
coord.x = width - 1;
coord.z = coord.x;
write_imageui(output, coord.zw, dst);
for(; coord.x > 0; coord.x--)
{
float4 data = read_imagef(input, coord.xy);
coord.z--;
cnt += 1.0;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(exclusive)
{
coord.z = 0;
write_imageui(output, coord.zw, dst);
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
coord.z++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
}
CUMSUM_INT_AXIS0_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_INT_AXIS0_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_INT_AXIS0_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
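For reference, the per-row arithmetic each CUMSUM_INT_AXIS0_2D_SH instantiation performs can be checked against a plain C sketch. This is an illustrative host-side helper (cumsum_row_ref is hypothetical, not part of ovxlib); roundf stands in for the kernel's round-to-nearest convert.

#include <math.h>

/* Running sum accumulated in raw input units; each output element is
 * round(sum * in_out_scale + cnt * in_out_zp_scale + output_zp),
 * matching tmpSum/tmpAlpha in the kernel. Exclusive mode shifts the
 * running sum by one element; rev walks the row backwards. */
static void cumsum_row_ref(const float *in, float *out, int width,
                           int exclusive, int rev,
                           float in_out_scale, float in_out_zp_scale,
                           float output_zp)
{
    float sum = 0.0f, cnt = 0.0f;
    if (exclusive && rev) {
        out[width - 1] = roundf(output_zp);            /* leading zero slot */
        for (int x = width - 1, z = width - 1; x > 0; x--) {
            z--;
            cnt += 1.0f;
            sum += in[x];
            out[z] = roundf(sum * in_out_scale + cnt * in_out_zp_scale + output_zp);
        }
    } else if (exclusive) {
        out[0] = roundf(output_zp);
        for (int x = 0, z = 0; x < width - 1; x++) {
            cnt += 1.0f;
            z++;
            sum += in[x];
            out[z] = roundf(sum * in_out_scale + cnt * in_out_zp_scale + output_zp);
        }
    } else if (rev) {
        for (int x = width - 1; x >= 0; x--) {
            cnt += 1.0f;
            sum += in[x];
            out[x] = roundf(sum * in_out_scale + cnt * in_out_zp_scale + output_zp);
        }
    } else {
        for (int x = 0; x < width; x++) {
            cnt += 1.0f;
            sum += in[x];
            out[x] = roundf(sum * in_out_scale + cnt * in_out_zp_scale + output_zp);
        }
    }
}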


@ -132,3 +132,30 @@ __kernel void one_hot_U8toU8
        coord.z ++;
    } while (coord.z < depth);
}
__kernel void one_hot_I32toBF16
(
__read_only image2d_t input,
__write_only image2d_array_t output,
int depth,
uint on_value,
uint off_value,
float inputScale,
float inputTail
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int4 src = read_imagei(input, coord.xy);
int val = convert_int(convert_float(src.x) * inputScale - inputTail);
do
{
uint4 dst;
dst.x = val == coord.z ? on_value : off_value;
write_imageui(output, coord.xzyw, dst.xxxx);
coord.z ++;
} while (coord.z < depth);
}
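A minimal C sketch of the one-hot semantics above, assuming a planar [depth][n] output layout purely for illustration (one_hot_ref is hypothetical):

/* Each (rescaled) input index selects which of the depth output slices
 * receives on_value; all other slices receive off_value. */
static void one_hot_ref(const int *in, unsigned *out, int n, int depth,
                        unsigned on_value, unsigned off_value,
                        float inputScale, float inputTail)
{
    for (int i = 0; i < n; i++) {
        int val = (int)((float)in[i] * inputScale - inputTail);
        for (int d = 0; d < depth; d++)
            out[d * n + i] = (val == d) ? on_value : off_value;  /* layout chosen for illustration */
    }
}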


@ -0,0 +1,373 @@
__kernel void rope_F32_F32toF32_axis0
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
float4 cos, sin;
READ_IMAGEF_2DARRAY(cos, cos_cache, coord);
READ_IMAGEF_2DARRAY(sin, sin_cache, coord);
coord.x = coord.x * step;
float4 src0 = read_imagef(input, coord);
int4 coord_out = coord;
coord.x += half_head_size;
float4 src1 = read_imagef(input, coord);
float4 dst0 = src0 * cos - src1 * sin;
float4 dst1 = src0 * sin + src1 * cos;
write_imagef(output, coord_out, dst0);
coord_out.x += half_head_size;
write_imagef(output, coord_out, dst1);
}
__kernel void rope_F32_F32toF32_axis1
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
float4 cos, sin;
READ_IMAGEF_2DARRAY(cos, cos_cache, coord);
READ_IMAGEF_2DARRAY(sin, sin_cache, coord);
coord.y = coord.y * step;
float4 src0 = read_imagef(input, coord);
int4 coord_out = coord;
coord.y += half_head_size;
float4 src1 = read_imagef(input, coord);
float4 dst0 = src0 * cos - src1 * sin;
float4 dst1 = src0 * sin + src1 * cos;
write_imagef(output, coord_out, dst0);
coord_out.y += half_head_size;
write_imagef(output, coord_out, dst1);
}
__kernel void rope_F32_F32toF32_axis2
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
float4 cos = read_imagef(cos_cache, coord);
float4 sin = read_imagef(sin_cache, coord);
coord.z = coord.z * step;
float4 src0 = read_imagef(input, coord);
int4 coord_out = coord;
coord.z += half_head_size;
float4 src1 = read_imagef(input, coord);
float4 dst0 = src0 * cos - src1 * sin;
float4 dst1 = src0 * sin + src1 * cos;
write_imagef(output, coord_out, dst0);
coord_out.z += half_head_size;
write_imagef(output, coord_out, dst1);
}
__kernel void rope_I32_I32toI32_axis0
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 _cos, _sin;
float4 cos, sin;
READ_IMAGEI_2DARRAY(_cos, cos_cache, coord);
READ_IMAGEI_2DARRAY(_sin, sin_cache, coord);
coord.x = coord.x * step;
float4 src0 = convert_float4(read_imagei(input, coord));
int4 coord_out = coord;
coord.x += half_head_size;
float4 src1 = convert_float4(read_imagei(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = convert_float4(_cos) - cos_zp;
sin = convert_float4(_sin) - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
int4 dst0 = convert_int4_rte(_dst0);
int4 dst1 = convert_int4_rte(_dst1);
write_imagei(output, coord_out, dst0);
coord_out.x += half_head_size;
write_imagei(output, coord_out, dst1);
}
__kernel void rope_I32_I32toI32_axis1
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 _cos, _sin;
float4 cos, sin;
READ_IMAGEI_2DARRAY(_cos, cos_cache, coord);
READ_IMAGEI_2DARRAY(_sin, sin_cache, coord);
coord.y = coord.y * step;
float4 src0 = convert_float4(read_imagei(input, coord));
int4 coord_out = coord;
coord.y += half_head_size;
float4 src1 = convert_float4(read_imagei(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = convert_float4(_cos) - cos_zp;
sin = convert_float4(_sin) - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
int4 dst0 = convert_int4_rte(_dst0);
int4 dst1 = convert_int4_rte(_dst1);
write_imagei(output, coord_out, dst0);
coord_out.y += half_head_size;
write_imagei(output, coord_out, dst1);
}
__kernel void rope_I32_I32toI32_axis2
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
float4 cos = convert_float4(read_imagei(cos_cache, coord));
float4 sin = convert_float4(read_imagei(sin_cache, coord));
coord.z = coord.z * step;
float4 src0 = convert_float4(read_imagei(input, coord));
int4 coord_out = coord;
coord.z += half_head_size;
float4 src1 = convert_float4(read_imagei(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = cos - cos_zp;
sin = sin - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
int4 dst0 = convert_int4_rte(_dst0);
int4 dst1 = convert_int4_rte(_dst1);
write_imagei(output, coord_out, dst0);
coord_out.z += half_head_size;
write_imagei(output, coord_out, dst1);
}
__kernel void rope_U32_U32toU32_axis0
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
uint4 _cos, _sin;
float4 cos, sin;
READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord);
READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord);
coord.x = coord.x * step;
float4 src0 = convert_float4(read_imageui(input, coord));
int4 coord_out = coord;
coord.x += half_head_size;
float4 src1 = convert_float4(read_imageui(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = convert_float4(_cos) - cos_zp;
sin = convert_float4(_sin) - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
uint4 dst0 = convert_uint4_rte(_dst0);
uint4 dst1 = convert_uint4_rte(_dst1);
write_imageui(output, coord_out, dst0);
coord_out.x += half_head_size;
write_imageui(output, coord_out, dst1);
}
__kernel void rope_U32_U32toU32_axis1
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
uint4 _cos, _sin;
float4 cos, sin;
READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord);
READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord);
coord.y = coord.y * step;
float4 src0 = convert_float4(read_imageui(input, coord));
int4 coord_out = coord;
coord.y += half_head_size;
float4 src1 = convert_float4(read_imageui(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = convert_float4(_cos) - cos_zp;
sin = convert_float4(_sin) - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
uint4 dst0 = convert_uint4_rte(_dst0);
uint4 dst1 = convert_uint4_rte(_dst1);
write_imageui(output, coord_out, dst0);
coord_out.y += half_head_size;
write_imageui(output, coord_out, dst1);
}
__kernel void rope_U32_U32toU32_axis2
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
float4 cos = convert_float4(read_imageui(cos_cache, coord));
float4 sin = convert_float4(read_imageui(sin_cache, coord));
coord.z = coord.z * step;
float4 src0 = convert_float4(read_imageui(input, coord));
int4 coord_out = coord;
coord.z += half_head_size;
float4 src1 = convert_float4(read_imageui(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = cos - cos_zp;
sin = sin - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
uint4 dst0 = convert_uint4_rte(_dst0);
uint4 dst1 = convert_uint4_rte(_dst1);
write_imageui(output, coord_out, dst0);
coord_out.z += half_head_size;
write_imageui(output, coord_out, dst1);
}
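All of the rope_* kernels apply the same pairwise rotation to an element and its partner half_head_size positions away; a hypothetical C reference of one pair is sketched below. The float kernels correspond to zero points of 0 and scales of 1.

/* Rotary position embedding for one (q0, q1) pair with cached cos/sin.
 * Quantized inputs are shifted by their zero points, the products are
 * rescaled by scale0/scale1, and output_zp is added back, matching
 * _dst0/_dst1 in the kernels. */
static void rope_pair_ref(float q0, float q1, float c, float s,
                          float input_zp, float cos_zp, float sin_zp,
                          float scale0, float scale1, float output_zp,
                          float *out0, float *out1)
{
    float x0 = q0 - input_zp;
    float x1 = q1 - input_zp;
    float cosv = c - cos_zp;
    float sinv = s - sin_zp;
    *out0 = x0 * cosv * scale0 - x1 * sinv * scale1 + output_zp;
    *out1 = x0 * sinv * scale1 + x1 * cosv * scale0 + output_zp;
}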


@ -0,0 +1,307 @@
#include "cl_viv_vx_ext.h"
_viv_uniform int top;
_viv_uniform int left;
_viv_uniform float out_scale_r;
_viv_uniform float out_scale_g;
_viv_uniform float out_scale_b;
_viv_uniform float out_zp_r;
_viv_uniform float out_zp_g;
_viv_uniform float out_zp_b;
_viv_uniform float pad_v_r;
_viv_uniform float pad_v_g;
_viv_uniform float pad_v_b;
_viv_uniform float scale_w;
_viv_uniform float scale_h;
_viv_uniform int resize_max_w;
_viv_uniform int resize_max_h;
_viv_uniform int out_height;
_viv_uniform int r_order;
_viv_uniform int b_order;
_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;
_viv_uniform VXC_512Bits uniLeftToFloat32_4x4;
_viv_uniform VXC_512Bits uniExtactHalf8_2x8;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
__kernel void custom_letterbox_U8toU8
(
__read_only image2d_t input,
__write_only image2d_t output,
int top_,
int bottom_,
int left_,
int right_,
float mean_r_,
float mean_g_,
float mean_b_,
float scale_r_,
float scale_g_,
float scale_b_,
int pad_r_,
int pad_g_,
int pad_b_,
int reverse_channel
)
{
int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
int2 coord = coord_out;
uint4 dst = (uint4)(0,0,0,0);
vxc_uchar8 result;
if (coord_out.x < left || coord_out.x >= resize_max_w ||
coord_out.y < top || coord_out.y >= resize_max_h)
{
dst.x = convert_uint(pad_v_r);
coord.y = coord_out.y + r_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_uint(pad_v_g);
coord.y = coord_out.y + out_height;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_uint(pad_v_b);
coord.y = coord_out.y + b_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
return;
}
float in_x = convert_float(coord_out.x - left) * scale_w;
float in_y = convert_float(coord_out.y - top) * scale_h;
float left_x_f = floor(in_x);
float top_y_f = floor(in_y);
float x_lerp = in_x - left_x_f;
float y_lerp = in_y - top_y_f;
int left_x_idx = convert_int(left_x_f);
int top_y_idx = convert_int(top_y_f);
int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
vxc_uchar8 top_data, bottom_data;
VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
float4 left4 = (float4)(0,0,0,0);
float4 right4 = (float4)(0,0,0,0);
float4 top4 = (float4)(0,0,0,0);
float4 bottom4 = (float4)(0,0,0,0);
VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
bottom4 = right4 * x_lerp + left4;
float4 out = (bottom4 - top4) * y_lerp + top4;
dst.x = convert_uint(out.x * out_scale_r + out_zp_r );
coord.y = coord_out.y + r_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_uint(out.y * out_scale_g + out_zp_g);
coord.y = coord_out.y + out_height;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_uint(out.z * out_scale_b + out_zp_b);
coord.y = coord_out.y + b_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void custom_letterbox_U8toI8
(
__read_only image2d_t input,
__write_only image2d_t output,
int top_,
int bottom_,
int left_,
int right_,
float mean_r_,
float mean_g_,
float mean_b_,
float scale_r_,
float scale_g_,
float scale_b_,
int pad_r_,
int pad_g_,
int pad_b_,
int reverse_channel
)
{
int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
int2 coord = coord_out;
int4 dst = (int4)(0,0,0,0);
vxc_char8 result;
if (coord_out.x < left || coord_out.x >= resize_max_w ||
coord_out.y < top || coord_out.y >= resize_max_h)
{
dst.x = convert_int(pad_v_r);
coord.y = coord_out.y + r_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_int(pad_v_g);
coord.y = coord_out.y + out_height;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_int(pad_v_b);
coord.y = coord_out.y + b_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
return;
}
float in_x = convert_float(coord_out.x - left) * scale_w;
float in_y = convert_float(coord_out.y - top) * scale_h;
float left_x_f = floor(in_x);
float top_y_f = floor(in_y);
float x_lerp = in_x - left_x_f;
float y_lerp = in_y - top_y_f;
int left_x_idx = convert_int(left_x_f);
int top_y_idx = convert_int(top_y_f);
int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
vxc_char8 top_data, bottom_data;
VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
float4 left4 = (float4)(0,0,0,0);
float4 right4 = (float4)(0,0,0,0);
float4 top4 = (float4)(0,0,0,0);
float4 bottom4 = (float4)(0,0,0,0);
VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
bottom4 = right4 * x_lerp + left4;
float4 out = (bottom4 - top4) * y_lerp + top4;
dst.x = convert_int(out.x * out_scale_r + out_zp_r);
coord.y = coord_out.y + r_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_int(out.y * out_scale_g + out_zp_g);
coord.y = coord_out.y + out_height;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_int(out.z * out_scale_b + out_zp_b);
coord.y = coord_out.y + b_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void custom_letterbox_U8toF16
(
__read_only image2d_t input,
__write_only image2d_t output,
int top_,
int bottom_,
int left_,
int right_,
float mean_r_,
float mean_g_,
float mean_b_,
float scale_r_,
float scale_g_,
float scale_b_,
int pad_r_,
int pad_g_,
int pad_b_,
int reverse_channel
)
{
int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
int2 coord = coord_out;
half4 tmp;
vxc_half8 dst_temp;
vxc_ushort8 dst;
if (coord_out.x < left || coord_out.x >= resize_max_w ||
coord_out.y < top || coord_out.y >= resize_max_h)
{
float4 pad = (float4)(pad_v_r, pad_v_g, pad_v_b, 0);
_viv_asm(CONV, tmp, pad);
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + r_order;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
tmp.x = tmp.y;
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + out_height;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
tmp.x = tmp.z;
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + b_order;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
return;
}
float in_x = convert_float(coord_out.x - left) * scale_w;
float in_y = convert_float(coord_out.y - top) * scale_h;
float left_x_f = floor(in_x);
float top_y_f = floor(in_y);
float x_lerp = in_x - left_x_f;
float y_lerp = in_y - top_y_f;
int left_x_idx = convert_int(left_x_f);
int top_y_idx = convert_int(top_y_f);
int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
vxc_uchar8 top_data, bottom_data;
VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
float4 left4 = (float4)(0,0,0,0);
float4 right4 = (float4)(0,0,0,0);
float4 top4 = (float4)(0,0,0,0);
float4 bottom4 = (float4)(0,0,0,0);
VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
bottom4 = right4 * x_lerp + left4;
float4 out = (bottom4 - top4) * y_lerp + top4;
float4 out_temp = (float4)(0,0,0,0);
out_temp.x = out.x * out_scale_r + out_zp_r;
_viv_asm(CONV, tmp, out_temp);
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + r_order;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
out_temp.x = out.y * out_scale_g + out_zp_g;
_viv_asm(CONV, tmp, out_temp);
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + out_height;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
out_temp.x = out.z * out_scale_b + out_zp_b;
_viv_asm(CONV, tmp, out_temp);
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + b_order;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
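A hedged C sketch of the bilinear sampling the letterbox kernels perform inside the resized region (letterbox_sample is hypothetical; the edge clamping here is an illustration choice, the kernel simply reads the neighbouring packed bytes). The kernels compute in_x = (x_out - left) * scale_w and in_y = (y_out - top) * scale_h before sampling, then apply the per-channel out_scale/out_zp affine.

#include <math.h>

/* Bilinear sample of channel c from packed RGB input at (in_x, in_y). */
static float letterbox_sample(const unsigned char *rgb, int in_w, int in_h,
                              int c, float in_x, float in_y)
{
    int x0 = (int)floorf(in_x), y0 = (int)floorf(in_y);
    float xl = in_x - (float)x0, yl = in_y - (float)y0;
    int x1 = (x0 + 1 < in_w) ? x0 + 1 : x0;   /* clamp for illustration */
    int y1 = (y0 + 1 < in_h) ? y0 + 1 : y0;
    float tl = rgb[(y0 * in_w + x0) * 3 + c];
    float tr = rgb[(y0 * in_w + x1) * 3 + c];
    float bl = rgb[(y1 * in_w + x0) * 3 + c];
    float br = rgb[(y1 * in_w + x1) * 3 + c];
    float top = tl + (tr - tl) * xl;
    float bot = bl + (br - bl) * xl;
    return top + (bot - top) * yl;
}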


@ -10,7 +10,12 @@
#include "cl_viv_vx_ext.h" #include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32; _viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32;
_viv_uniform VXC_512Bits uniExtract8Bin_2x8;
_viv_uniform int sf_size; _viv_uniform int sf_size;
_viv_uniform float srcScale;
_viv_uniform float srcZP;
_viv_uniform float dstScale;
_viv_uniform float dstZP;
#define F_MAX(a,b) ((a)>(b)?(a):(b)) #define F_MAX(a,b) ((a)>(b)?(a):(b))
__kernel void Softmax2VXC __kernel void Softmax2VXC
( (
@ -19,35 +24,37 @@ __kernel void Softmax2VXC
int axis int axis
) )
{ {
int4 coord_in = (int4)(0,0,0,0); int4 coord_in = (int4)(0, get_global_id(0), 0, 0);
float fMax = 0.0; float fMax = 0;
for (int i = 0; i < sf_size; i++) for (int i = 0; i < sf_size; i++)
{ {
vxc_char8 val; vxc_short8 val;
vxc_half8 val_h;
coord_in.x = i; coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, val_h, val, 16);
float fval; float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
fMax = F_MAX(fMax, fval); fMax = F_MAX(fMax, fval);
} }
float fProbSum = 0.0f; float fProbSum = 0.0f;
vxc_short8 dst; vxc_short8 dst;
for (int i = 0; i < sf_size; i++) for (int i = 0; i < sf_size; i++)
{ {
vxc_char8 val; vxc_short8 val;
vxc_half8 val_h;
coord_in.x = i; coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, val_h, val, 16);
float fval; float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
float fOut = (float)exp(fval - fMax); float fOut = (float)exp(fval - fMax);
fProbSum += fOut; fProbSum += fOut;
half hVal; half hVal;
_viv_asm(CONV, hVal, fOut); _viv_asm(CONV, hVal, fOut);
_viv_asm(COPY, dst, hVal, 4); _viv_asm(COPY, dst, hVal, 4);
VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
} }
@ -60,7 +67,6 @@ __kernel void Softmax2VXC
float fval; float fval;
_viv_asm(COPY, val_h,val, 16); _viv_asm(COPY, val_h,val, 16);
VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32); VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
float fOut =fval / fProbSum; float fOut =fval / fProbSum;
half hVal; half hVal;
_viv_asm(CONV, hVal, fOut); _viv_asm(CONV, hVal, fOut);
@ -68,3 +74,57 @@ __kernel void Softmax2VXC
VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
} }
} }
__kernel void Softmax2VXC_u8
(
image2d_array_t input,
image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(0, get_global_id(0), 0, 0);
float fMax = -3.4e38f;
for (int i = 0; i < sf_size; i++)
{
vxc_uchar8 val;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
fval = (fval - srcZP) * srcScale;
fMax = F_MAX(fMax, fval);
}
float fProbSum = 0.0f;
vxc_uchar8 dst;
for (int i = 0; i < sf_size; i++)
{
vxc_uchar8 val;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
fval = (fval - srcZP) * srcScale;
float fOut = (float)exp(fval - fMax);
fProbSum += fOut;
}
for (int i = 0; i < sf_size; i++)
{
vxc_uchar8 val;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
fval = (fval - srcZP) * srcScale;
float fOut = exp(fval - fMax) / fProbSum;
fOut = fOut * dstScale + dstZP;
short dst0;
_viv_asm(CONV, dst0, fOut);
VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8);
VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
}
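Softmax2VXC_u8 follows the usual dequantize, max-subtract, exp, normalize, requantize sequence; a hypothetical C reference (rounding and saturation to u8 left to the caller, as in the kernel's final extract step):

#include <math.h>

/* Numerically stable softmax along one row of n quantized values. */
static void softmax_u8_ref(const unsigned char *in, float *out, int n,
                           float srcScale, float srcZP,
                           float dstScale, float dstZP)
{
    float fmax = -3.4e38f, sum = 0.0f;
    for (int i = 0; i < n; i++) {
        float v = ((float)in[i] - srcZP) * srcScale;
        if (v > fmax) fmax = v;
    }
    for (int i = 0; i < n; i++)
        sum += expf(((float)in[i] - srcZP) * srcScale - fmax);
    for (int i = 0; i < n; i++) {
        float p = expf(((float)in[i] - srcZP) * srcScale - fmax) / sum;
        out[i] = p * dstScale + dstZP;   /* requantized, pre-rounding */
    }
}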


@ -16,7 +16,7 @@ _viv_uniform float sum_x2_tail1;
_viv_uniform float output_scale;
_viv_uniform float output_zp;
#define GROUP_NORM_SUMS_16BITS_IMPL(name, load_type, src_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \
    __read_only image2d_array_t input, \
    __write_only image2d_array_t output, \
@ -26,7 +26,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
    int lidx = get_local_id(0); \
    int gidz = get_global_id(1); \
    int4 coord = (int4)(gidx, 0, gidz, 0); \
    load_type src; \
    src_type in_h; \
    float4 sumsqr; \
    float4 tmpSumSqr = (float4)(0); \
@ -43,9 +43,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
    { \
        for(coord.y = 0; coord.y < height;) \
        { \
            VXC_OP4(img_load_3d, src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
            coord.y++; \
            _viv_asm(COPY, in_h, src, 16); \
            VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \
            tmpSumSqr += sumsqr; \
        } \
@ -76,10 +76,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
        write_imagef(output, coord_out, data); \
    } \
}
GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_short8, vxc_half8)
GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8, vxc_short8)
GROUP_NORM_SUMS_16BITS_IMPL(U16, vxc_ushort8, vxc_ushort8)
#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, load_type, src_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \
    __read_only image2d_array_t input, \
    __write_only image2d_array_t output, \
@ -89,7 +90,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
    int lidx = get_local_id(0); \
    \
    int2 coord = (int2)(gidx, get_global_id(1)); \
    load_type src; \
    src_type in_h; \
    float4 sumsqr = (float4)(0); \
    \
@ -98,8 +99,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
    \
    if(gidx < width) \
    { \
        VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
        _viv_asm(COPY, in_h, src, 16); \
        VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \
        sumsqr.y = sumsqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sumsqr.x; \
        sumsqr.x = sumsqr.x * input_scale + sum_x_tail; \
@ -128,8 +129,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
        write_imagef(output, coord_out, data); \
    } \
}
GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_short8, vxc_half8)
GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8, vxc_short8)
GROUP_NORM_SUMS_16BITS_IMPL_2D(U16, vxc_ushort8, vxc_ushort8)
#define GROUP_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \
@ -178,7 +180,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
    _viv_asm(CONV_RTE, tmpVal0, norm); \
    norm = alpha * tmpData1 + bias_val; \
    _viv_asm(CONV_RTE, tmpVal1, norm); \
    VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    _viv_asm(COPY, outval, dst, 16); \
    VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
@ -230,10 +232,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
    VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
    float4 norm; \
    norm = alpha * tmpData0 + bias_val; \
    _viv_asm(CONV_RTE, tmpVal0, norm); \
    norm = alpha * tmpData1 + bias_val; \
    _viv_asm(CONV_RTE, tmpVal1, norm); \
    VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    _viv_asm(COPY, outval, dst, 16); \
    VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
@ -283,7 +285,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
    \
    float4 norm; \
    norm = alpha * tmpData0 + bias_val; \
    _viv_asm(CONV_RTE, tmpVal0, norm); \
    norm = alpha * tmpData1 + bias_val; \
    _viv_asm(CONV_RTE, tmpVal1, norm); \
    VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
@ -296,6 +298,7 @@ GROUP_NORM_16BITS_F32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int
GROUP_NORM_16BITS_F32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)
GROUP_NORM_16BITS_F32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)
GROUP_NORM_16BITS_F32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)
GROUP_NORM_16BITS_F32_IMPL(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4)
#define GROUP_NORM_16BITS_F32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \
@ -333,10 +336,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
    VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
    float4 norm; \
    norm = alpha * tmpData0 + bias_val; \
    _viv_asm(CONV_RTE, tmpVal0, norm); \
    norm = alpha * tmpData1 + bias_val; \
    _viv_asm(CONV_RTE, tmpVal1, norm); \
    VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    _viv_asm(COPY, outval, dst, 16); \
    VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
@ -346,4 +349,5 @@ GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8,
GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)
GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)
GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)
GROUP_NORM_16BITS_F32_IMPL_2D(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4)


@ -115,45 +115,45 @@ _viv_uniform VXC_512Bits uniDataSubZPtoFp32Part1_4x4;
_viv_uniform VXC_512Bits uniConvF16toF32_part0_4x4;
_viv_uniform VXC_512Bits uniConvF16toF32_part1_4x4;
_viv_uniform VXC_512Bits uniExtact8Bin_2x8;
_viv_uniform int input0_zp;
_viv_uniform int input1_zp;
_viv_uniform float input0_scale;
_viv_uniform float input1_scale;
_viv_uniform float output_zp;
#define PRELU_F16_3D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \
__kernel void prelu_##name( \
    __read_only image2d_array_t input0, \
    __read_only image2d_array_t input1, \
    __write_only image2d_array_t output) \
{\
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\
    float4 vecA, vecB, vecC, vecD;\
    input_type0 srcA;\
    copy_type0 src0;\
    vxc_short8 srcB;\
    vxc_half8 src1;\
    input_type0 zp;\
    VXC_ReadImage2DArray(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
    _viv_asm(COPY, src0, srcA, 16); \
    VXC_ReadImage2DArray(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
    _viv_asm(COPY, src1, srcB, 16); \
    \
    _viv_asm(COPY, zp, input0_zp, 4);\
    VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
        uniDataSubZPtoFp32Part0_4x4); \
    VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
        uniDataSubZPtoFp32Part1_4x4);\
    VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\
    VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\
    \
    vecA = vecA * input0_scale;\
    vecB = vecB * input0_scale;\
    float4 maxData0 = vecA > 0 ? vecA : 0.0; \
    float4 maxData1 = vecB > 0 ? vecB : 0.0; \
    float4 minData0 = vecA < 0 ? vecA : 0.0; \
    float4 minData1 = vecB < 0 ? vecB : 0.0; \
    vecA = maxData0 + vecC * minData0 + output_zp;\
    vecB = maxData1 + vecD * minData1 + output_zp;\
    convert_type dst0, dst1;\
    _viv_asm(CONV_RTE, dst0, vecA);\
    _viv_asm(CONV_RTE, dst1, vecB);\
@ -164,49 +164,49 @@ _viv_uniform float outputZP;
    VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
}
// name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type
PRELU_F16_3D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
PRELU_F16_3D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
PRELU_F16_3D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_3D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
PRELU_F16_3D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
PRELU_F16_3D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
#define PRELU_F16_2D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \
__kernel void prelu_##name##_2D( \
    __read_only image2d_array_t input0, \
    __read_only image2d_array_t input1, \
    __write_only image2d_array_t output) \
{\
    int2 coord = (int2)(get_global_id(0), get_global_id(1));\
    float4 vecA, vecB, vecC, vecD;\
    input_type0 srcA;\
    copy_type0 src0;\
    vxc_short8 srcB;\
    vxc_half8 src1;\
    input_type0 zp;\
    VXC_ReadImage(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
    _viv_asm(COPY, src0, srcA, 16); \
    VXC_ReadImage(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
    _viv_asm(COPY, src1, srcB, 16); \
    \
    _viv_asm(COPY, zp, input0_zp, 4);\
    VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
    VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
    VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\
    VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\
    \
    vecA = vecA * input0_scale;\
    vecB = vecB * input0_scale;\
    float4 maxData0 = vecA > 0 ? vecA : 0.0; \
    float4 maxData1 = vecB > 0 ? vecB : 0.0; \
    float4 minData0 = vecA < 0 ? vecA : 0.0; \
    float4 minData1 = vecB < 0 ? vecB : 0.0; \
    vecA = maxData0 + vecC * minData0 + output_zp;\
    vecB = maxData1 + vecD * minData1 + output_zp;\
    convert_type dst0, dst1;\
    _viv_asm(CONV_RTE, dst0, vecA);\
    _viv_asm(CONV_RTE, dst1, vecB);\
@ -216,49 +216,49 @@ PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha
    _viv_asm(COPY, dst, dst2, 16); \
    VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
}
PRELU_F16_2D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
PRELU_F16_2D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_2D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
PRELU_F16_2D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
PRELU_F16_2D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
PRELU_F16_2D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
#define PRELU_INTEGER_2D(name, src0_type, src1_type, output_type, convert_type, copy_type) \
__kernel void prelu_##name##_2D( \
    __read_only image2d_array_t input0, \
    __read_only image2d_array_t input1, \
    __write_only image2d_array_t output) \
{\
    int2 coord = (int2)(get_global_id(0), get_global_id(1));\
    float4 vecA, vecB, vecC, vecD;\
    src0_type src0;\
    src1_type src1;\
    short zp0;\
    short zp1;\
    VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
    VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
    \
    _viv_asm(COPY, zp0, input0_zp, 2);\
    VXC_DP4x4(vecA, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
    VXC_DP4x4(vecB, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
    _viv_asm(COPY, zp1, input1_zp, 4);\
    VXC_DP4x4(vecC, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
    VXC_DP4x4(vecD, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
    \
    vecA = vecA * input0_scale;\
    vecB = vecB * input0_scale;\
    vecC = vecC * input1_scale;\
    vecD = vecD * input1_scale;\
    float4 maxData0 = vecA >= 0 ? vecA : 0.0; \
    float4 maxData1 = vecB >= 0 ? vecB : 0.0; \
    float4 minData0 = vecA < 0 ? vecA : 0.0; \
    float4 minData1 = vecB < 0 ? vecB : 0.0; \
    vecA = maxData0 + vecC * minData0 + output_zp;\
    vecB = maxData1 + vecD * minData1 + output_zp;\
    convert_type dst0, dst1;\
    _viv_asm(CONV_RTE, dst0, vecA);\
    _viv_asm(CONV_RTE, dst1, vecB);\
@ -268,7 +268,8 @@ PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha
    _viv_asm(COPY, dst, dst2, 16); \
    VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
}
PRELU_INTEGER_2D(U8U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_INTEGER_2D(U8U8toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
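The PReLU macros reduce to the same scalar formula for every type combination; a hypothetical C reference of one element is below. For the F16-slope variants, input1_zp is 0 and input1_scale is 1.

/* Dequantize value and slope, keep the positive part, scale the negative
 * part by the slope, then add the output zero point (requantization scale
 * is folded into input0_scale/input1_scale, as in the kernels). */
static float prelu_ref(float q, float slope_q,
                       float input0_zp, float input0_scale,
                       float input1_zp, float input1_scale,
                       float output_zp)
{
    float x = (q - input0_zp) * input0_scale;
    float a = (slope_q - input1_zp) * input1_scale;
    float pos = x > 0.0f ? x : 0.0f;
    float neg = x < 0.0f ? x : 0.0f;
    return pos + a * neg + output_zp;
}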


@ -0,0 +1,181 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [32:63] output zp
_viv_uniform VXC_512Bits uniResize2xUp_0_4x8;
_viv_uniform VXC_512Bits uniResize2xUp_1_4x8;
_viv_uniform int out_height;
__kernel void resize_bilinear_U8toU8_2x_upsample_half_pixel_centers
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
coord_in.x = (coord_out.x * 2 - 1) >> 2;
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
vxc_uchar16 in0, in1, tmp, result;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
vxc_ushort8 dst0;
while (coord_out.y < out_height)
{
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.y += 2;
coord_out.y++;
}
}
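Both upsample kernels use the standard half-pixel-centers source mapping; a small C sketch (half_pixel_src_coord is hypothetical) shows the formula the integer shifts implement.

/* in = (out + 0.5) / scale - 0.5. For scale = 2 its floor equals the
 * kernel's (out * 2 - 1) >> 2, and for scale = 4 it equals
 * (out * 2 - 3) >> 3, including the -1 clamp at out = 0. */
static float half_pixel_src_coord(int out_coord, float scale)
{
    return ((float)out_coord + 0.5f) / scale - 0.5f;
}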
_viv_uniform VXC_512Bits uniResize4xUp_l00_4x8;
_viv_uniform VXC_512Bits uniResize4xUp_l01_4x8;
_viv_uniform VXC_512Bits uniResize4xUp_l10_4x8;
_viv_uniform VXC_512Bits uniResize4xUp_l11_4x8;
__kernel void resize_bilinear_U8toU8_4x_upsample_half_pixel_centers
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
coord_in.x = (coord_out.x * 2 - 3) >> 3;
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
vxc_uchar16 in0, in1, dst0, dst1;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
vxc_ushort8 tmp;
while (coord_out.y < out_height)
{
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.y += 2;
coord_out.y++;
}
}


@ -0,0 +1,102 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp
_viv_uniform VXC_512Bits uniResize3xUp_l00_2x8;
_viv_uniform VXC_512Bits uniResize3xUp_l01_2x8;
_viv_uniform VXC_512Bits uniResize3xUp_l10_4x4;
_viv_uniform VXC_512Bits uniResize3xUp_l11_4x4;
_viv_uniform VXC_512Bits uniResize3xUp_l12_4x4;
_viv_uniform VXC_512Bits uniResize3xUp_l13_4x4;
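// 3x bilinear upsample, U8 -> U8, half_pixel_centers.
// No row loop: each work-item loads a four-row input neighbourhood (in0..in3) and
// writes a 15-wide x 6-high output tile, re-quantizing every row with
// uniU8PostProcess_2x8; the uniResize3xUp_l* uniforms are the host-configured
// interpolation patterns.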
__kernel void resize_bilinear_U8toU8_3x_upsample_half_pixel_centers
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
coord_in.x = (short)(coord_out.x * 2 - 1) / (short)6;
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6;
coord_in.y = coord_out.y == 0 ? -1 : coord_in.y;
vxc_uchar16 in0, in1, in2, in3, tmp, dst0, dst1, dst2;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
vxc_ushort8 data;
VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);
VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);
VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);
VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);
VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
}


@ -0,0 +1,167 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp
_viv_uniform int out_height;
_viv_uniform VXC_512Bits uniResize8xUp_l00_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l01_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l10_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l11_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l20_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l21_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l30_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l31_4x8;
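// 8x bilinear upsample, U8 -> U8, half_pixel_centers.
// Same structure as the 4x kernel: each loop iteration consumes two new input rows
// and writes sixteen output rows (dst0..dst3, four times), with uniResize8xUp_l*
// supplying the interpolation patterns and uniU8PostProcess_2x8 the U8 re-quantization.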
__kernel void resize_bilinear_U8toU8_8x_upsample_half_pixel_centers
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
coord_in.x = (coord_out.x * 2 - 7) >> 4;
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
vxc_uchar16 in0, in1, in2, dst0, dst1, dst2, dst3;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
vxc_ushort8 tmp;
while (coord_out.y < out_height)
{
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.y += 2;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
}
}


@ -0,0 +1,303 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform int half_head_size;
_viv_uniform VXC_512Bits uniATimesB_0_4x4;
_viv_uniform VXC_512Bits uniATimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
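// Rotate-half RoPE, BNHS layout, symmetrically quantized inputs.
// x_lo is loaded at coord.y and x_hi at coord.y + half_head_size; the kernel writes
//   y_lo = x_lo*cos*scale0 - x_hi*sin*scale1 + output_zp
//   y_hi = x_lo*sin*scale1 + x_hi*cos*scale0 + output_zp
// where scale0/scale1/output_zp are the host-provided re-quantization terms.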
#define ROPE_BNHS_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bnhs \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
int4 coord_out = coord_in; \
\
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
\
src_type data0, data1; \
src1_type cos, sin; \
copy_type v0, v1; \
dst_type dst; \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord_in.y += half_head_size; \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 data2, data3, data4, data5; \
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale0 - data5 * scale1 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
data2 = data2 * scale1 + data4 * scale0 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.y += half_head_size; \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_BNHS_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BNHS_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BNHS_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BNHS_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BNHS_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BNHS_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)
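// FP16 in / FP16 out variant of the rotate-half RoPE above: products are accumulated
// in float and converted back to half with CONV_RTE before being packed through
// uniExtract8Data_2x8.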
__kernel void rope_F16_F16toF16_bnhs
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord_in;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
vxc_short8 v0, v1, v2, v3, dst;
vxc_half8 data0, data1, cos, sin, dst2;
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data0, v0, 16);
VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, cos, v1, 16);
VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, sin, v2, 16);
coord_in.y += half_head_size;
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data1, v3, 16);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 data2, data3, data4, data5;
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
data2 = data2 - data4;
data3 = data3 - data5;
half4 dst0;
half4 dst1;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
data2 = data2 + data4;
data3 = data3 + data5;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
coord_out.y += half_head_size;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
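// Asymmetrically quantized variant: input, cos and sin are first shifted by their
// zero-points (in0_zp / cos_zp / sin_zp via uniAMinusZp_*), then rotated and
// re-quantized with scale0/scale1 and output_zp as in the symmetric kernels.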
#define ROPE_ASYM_BNHS(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnhs \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
int4 coord_out = coord_in; \
\
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
\
dtype data0, data1, dst; \
src1_type cos, sin; \
copy_type v0, v1; \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord_in.y += half_head_size; \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 l00, l01, cos0, cos1; \
float4 l10, l11, sin0, sin1; \
VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \
float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \
data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.y += half_head_size; \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNHS(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNHS(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNHS(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNHS(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNHS(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNHS(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
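// BF16 variant: each bf16 lane is widened to float by placing its bit pattern in the
// high 16 bits of an f32 word (uniConvBF16toF32_Part*_2x8 with a zero operand), the
// rotation is computed in float, and the results are narrowed back to bf16 by keeping
// the upper 16 bits via uniExtractOddData_2x8.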
__kernel void rope_BF16_BF16toBF16_bnhs
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord_in;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
vxc_ushort8 v0, v1, v2, v3, dst;
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y += half_head_size;
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_short8 data;
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src0, data, 16);
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src1, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, cos0, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, cos1, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, sin0, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, sin1, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src2, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src3, data, 16);
float4 data0 = src0 * cos0 - src2 * sin0;
float4 data1 = src1 * cos1 - src3 * sin1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
data0 = src0 * sin0 + src2 * cos0;
data1 = src1 * sin1 + src3 * cos1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
coord_out.y += half_head_size;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}


@ -0,0 +1,245 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform int half_head_size;
_viv_uniform VXC_512Bits uniATimesB_0_4x4;
_viv_uniform VXC_512Bits uniATimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
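// Same rotate-half RoPE math as the BNHS kernels, but the BNH1 case is addressed as a
// plain 2D image: x_lo/x_hi are read with VXC_ReadImage at x and x + half_head_size
// and written back with VXC_WriteImage, so no image descriptor setup is needed.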
#define ROPE_BNH1_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bnh1 \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
src_type data0, data1; \
src1_type cos, sin; \
copy_type v0, v1; \
VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord.x += half_head_size; \
VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
float4 data2, data3, data4, data5; \
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale0 - data5 * scale1 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
dst_type dst; \
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
data2 = data2 * scale1 + data4 * scale0 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_BNH1_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BNH1_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BNH1_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BNH1_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BNH1_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BNH1_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)
__kernel void rope_F16_F16toF16_bnh1
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
vxc_short8 v0, v1, v2, v3, dst;
vxc_half8 data0, data1, cos, sin, dst2;
VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data0, v0, 16);
VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, cos, v1, 16);
VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, sin, v2, 16);
coord.x += half_head_size;
VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data1, v3, 16);
float4 data2, data3, data4, data5;
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
data2 = data2 - data4;
data3 = data3 - data5;
half4 dst0;
half4 dst1;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
data2 = data2 + data4;
data3 = data3 + data5;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
#define ROPE_ASYM_BNH1(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnh1 \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
dtype data0, data1, dst; \
src1_type cos, sin; \
copy_type v0, v1; \
VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord.x += half_head_size; \
VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
float4 l00, l01, cos0, cos1; \
float4 l10, l11, sin0, sin1; \
VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \
float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \
data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNH1(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNH1(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNH1(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNH1(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNH1(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNH1(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
__kernel void rope_BF16_BF16toBF16_bnh1
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
vxc_ushort8 v0, v1, v2, v3, dst;
VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += half_head_size;
VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_short8 data;
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src0, data, 16);
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src1, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, cos0, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, cos1, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, sin0, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, sin1, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src2, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src3, data, 16);
float4 data0 = src0 * cos0 - src2 * sin0;
float4 data1 = src1 * cos1 - src3 * sin1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
data0 = src0 * sin0 + src2 * cos0;
data1 = src1 * sin1 + src3 * cos1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}


@ -0,0 +1,312 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
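// Interleaved RoPE, BSNH layout: each work-item reads 16 consecutive input values as
// (even, odd) pairs and rotates them pairwise,
//   y_even = even*cos*scale0 - odd*sin*scale1 + output_zp
//   y_odd  = even*sin*scale1 + odd*cos*scale0 + output_zp
// using the uniAEvenTimesB_* / uniAOddTimesB_* lane selectors, then packs the results
// back through uniExtract8Data_2x8.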
#define ROPE_BSNH_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bsnh \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
\
src_type data0, data1; \
src1_type cos, sin; \
copy_type v0, v1; \
dst_type dst; \
VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
\
coord_in.x *= 2; \
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int4 coord_out = coord_in; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 data2, data3, data4, data5; \
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.x += 8; \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_BSNH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BSNH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BSNH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BSNH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BSNH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BSNH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)
__kernel void rope_F16_F16toF16_bsnh
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
vxc_short8 v0, v1, v2, v3, dst;
vxc_half8 data0, data1, cos, sin, dst2;
VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, cos, v1, 16);
VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, sin, v2, 16);
coord_in.x *= 2;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data0, v0, 16);
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data1, v3, 16);
int4 coord_out = coord_in;
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 data2, data3, data4, data5;
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
data2 = data2 - data4;
data3 = data3 + data5;
half4 dst0;
half4 dst1;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
data2 = data2 - data4;
data3 = data3 + data5;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
coord_out.x += 8;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4;
_viv_uniform VXC_512Bits uniAOddMinusZp_4x4;
#define ROPE_ASYM_BSNH(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bsnh \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
\
dtype data0, data1, dst; \
src1_type cos, sin; \
copy_type v0, v1; \
\
VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord_in.x *= 2; \
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int4 coord_out = coord_in; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 l00, l01, cos0, cos1; \
float4 l10, l11, sin0, sin1; \
VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \
float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.x += 8; \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BSNH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BSNH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BSNH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BSNH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BSNH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BSNH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
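// BF16 interleaved variant: after widening to float, the even/odd lanes are separated
// with vector swizzles (src.xz / src.yw), rotated, and the bf16 results are re-packed
// with uniExtractOddData_2x8.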
__kernel void rope_BF16_BF16toBF16_bsnh
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
vxc_ushort8 v0, v1, v2, v3, dst;
VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.x *= 2;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int4 coord_out = coord_in;
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_short8 data;
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src0, data, 16);
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src1, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, cos0, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, cos1, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, sin0, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, sin1, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src2, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src3, data, 16);
float4 even = (float4)(src0.xz, src1.xz);
float4 odd = (float4)(src0.yw, src1.yw);
float4 data0 = even * cos0 - odd * sin0;
float4 data1 = even * sin0 + odd * cos0;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
even = (float4)(src2.xz, src3.xz);
odd = (float4)(src2.yw, src3.yw);
data0 = even * cos1 - odd * sin1;
data1 = even * sin1 + odd * cos1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
coord_out.x += 8;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
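The bfloat16 kernel above does the same rotation in plain float32: the uniConvBF16toF32 uniforms widen each bfloat16 lane into the upper half of a float32 word (bfloat16 is the high 16 bits of an IEEE-754 float), and uniExtractOddData packs the rotated results back to bfloat16. A minimal scalar sketch of the pairwise update it computes:
/* Rotate one adjacent (even, odd) pair by the cached angle; this is the scalar
   form of data0/data1 in the kernel above. */
static void rope_pair_f32(float even, float odd, float cos_v, float sin_v,
                          float *out_even, float *out_odd)
{
    *out_even = even * cos_v - odd * sin_v;
    *out_odd  = even * sin_v + odd * cos_v;
}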

View File

@ -0,0 +1,312 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
#define ROPE_BNSH_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bnsh \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
\
src_type data0, data1; \
src1_type cos, sin; \
copy_type v0, v1; \
dst_type dst; \
VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
\
coord_in.x *= 2; \
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int4 coord_out = coord_in; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 data2, data3, data4, data5; \
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.x += 8; \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_BNSH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BNSH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BNSH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BNSH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BNSH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BNSH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)
__kernel void rope_F16_F16toF16_bnsh
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
vxc_short8 v0, v1, v2, v3, dst;
vxc_half8 data0, data1, cos, sin, dst2;
VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, cos, v1, 16);
VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, sin, v2, 16);
coord_in.x *= 2;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data0, v0, 16);
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data1, v3, 16);
int4 coord_out = coord_in;
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 data2, data3, data4, data5;
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
data2 = data2 - data4;
data3 = data3 + data5;
half4 dst0;
half4 dst1;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
data2 = data2 - data4;
data3 = data3 + data5;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
coord_out.x += 8;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4;
_viv_uniform VXC_512Bits uniAOddMinusZp_4x4;
#define ROPE_ASYM_BNSH(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnsh \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
\
dtype data0, data1, dst; \
src1_type cos, sin; \
copy_type v0, v1; \
\
VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord_in.x *= 2; \
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int4 coord_out = coord_in; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 l00, l01, cos0, cos1; \
float4 l10, l11, sin0, sin1; \
VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \
float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.x += 8; \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNSH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNSH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNSH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNSH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNSH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNSH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
__kernel void rope_BF16_BF16toBF16_bnsh
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
vxc_ushort8 v0, v1, v2, v3, dst;
VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.x *= 2;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int4 coord_out = coord_in;
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_short8 data;
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src0, data, 16);
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src1, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, cos0, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, cos1, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, sin0, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, sin1, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src2, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src3, data, 16);
float4 even = (float4)(src0.xz, src1.xz);
float4 odd = (float4)(src0.yw, src1.yw);
float4 data0 = even * cos0 - odd * sin0;
float4 data1 = even * sin0 + odd * cos0;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
even = (float4)(src2.xz, src3.xz);
odd = (float4)(src2.yw, src3.yw);
data0 = even * cos1 - odd * sin1;
data1 = even * sin1 + odd * cos1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
coord_out.x += 8;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}

View File

@ -93,3 +93,101 @@ __kernel void scatter_nd_update_cpy2out_##src0_type##to##src0_type( \
} }
SCATTER_ND_UPDATE_COPY2OUT(U8, vxc_uchar16, 1) SCATTER_ND_UPDATE_COPY2OUT(U8, vxc_uchar16, 1)
SCATTER_ND_UPDATE_COPY2OUT(I8, vxc_char16, 1) SCATTER_ND_UPDATE_COPY2OUT(I8, vxc_char16, 1)
SCATTER_ND_UPDATE_COPY2OUT(U16, vxc_ushort8, 2)
SCATTER_ND_UPDATE_COPY2OUT(I16, vxc_short8, 2)
#define SCATTER_ND_UPDATE_REF2OUT_16BITS(src0_type, data_type) \
__kernel void scatter_nd_update_ref2out_##src0_type##to##src0_type( \
__read_only image2d_t input_ref, \
image2d_t temp_ref, \
image2d_t output0 \
) \
{ \
int gidx = get_global_id(0); \
Image img0 = create_image_from_image2d(input_ref, 2); \
Image img1 = create_image_from_image2d(temp_ref, 2); \
__global data_type* in_ptr = (__global data_type*)img0.ptr; \
__global data_type* out_ptr = (__global data_type*)img1.ptr; \
data_type src, dst; \
src = in_ptr[gidx]; \
vxc_ushort8 mp0; \
_viv_asm(COPY, mp0, multAndoutZP0, 16); \
VXC_DP2x8(dst, src, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift0_Lo_2x8); \
out_ptr[gidx] = dst; \
}
SCATTER_ND_UPDATE_REF2OUT_16BITS(U16, vxc_ushort8)
SCATTER_ND_UPDATE_REF2OUT_16BITS(I16, vxc_short8)
#define SCATTER_ND_UPDATE_UPDATE2REF_16BITS(src0_type, data_type) \
__kernel void scatter_nd_update_update2ref_##src0_type##to##src0_type##_16x( \
__read_only image2d_t input_index, \
__read_only image2d_t input_update, \
image2d_t temp_ref, \
image2d_t input0, \
image2d_t output1, \
int width, int area, int vol, int coord_dim \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
\
Image img1 = create_image_from_image2d(input_index, 4); \
Image img2 = create_image_from_image2d(input_update, 2); \
Image img3 = create_image_from_image2d(temp_ref, 2); \
__global int* index_ptr = (__global int*)img1.ptr; \
__global data_type* update_ptr = (__global data_type*)img2.ptr; \
__global data_type* output_ptr = (__global data_type*)img3.ptr; \
data_type dst; \
\
int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx); \
data_type src = update_ptr[gidy * update_width + gidx]; \
int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \
int loc = idx * output_width + gidx; \
vxc_ushort8 mp1; \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
VXC_DP2x8(dst, src, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift1_Lo_2x8); \
output_ptr[loc] = dst; \
}
SCATTER_ND_UPDATE_UPDATE2REF_16BITS(U16, vxc_ushort8)
SCATTER_ND_UPDATE_UPDATE2REF_16BITS(I16, vxc_short8)
__kernel void scatter_nd_update_ref2out_F16toF16(
__read_only image2d_t input_ref,
image2d_t temp_ref,
image2d_t output0
)
{
int gidx = get_global_id(0);
Image img0 = create_image_from_image2d(input_ref, 2);
Image img1 = create_image_from_image2d(temp_ref, 2);
__global vxc_ushort8* in_ptr = (__global vxc_ushort8*)img0.ptr;
__global vxc_ushort8* out_ptr = (__global vxc_ushort8*)img1.ptr;
out_ptr[gidx] = in_ptr[gidx];
}
__kernel void scatter_nd_update_update2ref_F16toF16_16x(
__read_only image2d_t input_index,
__read_only image2d_t input_update,
image2d_t temp_ref,
image2d_t input0,
image2d_t output1,
int width, int area, int vol, int coord_dim
)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
Image img1 = create_image_from_image2d(input_index, 4);
Image img2 = create_image_from_image2d(input_update, 2);
Image img3 = create_image_from_image2d(temp_ref, 2);
__global int* index_ptr = (__global int*)img1.ptr;
__global vxc_ushort8* update_ptr = (__global vxc_ushort8*)img2.ptr;
__global vxc_ushort8* output_ptr = (__global vxc_ushort8*)img3.ptr;
int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx);
int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW;
int loc = idx * output_width + gidx;
output_ptr[loc] = update_ptr[gidy * update_width + gidx];
}
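For reference, the addressing used by the update2ref kernels above reduces to the flat-offset computation below (a host-side sketch; offset_idx, offsetX..offsetW, update_width and output_width are kernel uniforms defined outside this excerpt, presumably the element strides of the indexed dimensions, with unused offsets taken as zero when coord_dim < 4):
/* Map one index row plus the element position inside an update row to a flat
   element offset in the reference tensor. */
static long scatter_nd_flat_offset(const int indice[4],
                                   int offsetX, int offsetY, int offsetZ, int offsetW,
                                   int output_width, int gidx)
{
    long idx = (long)indice[0] * offsetX + (long)indice[1] * offsetY
             + (long)indice[2] * offsetZ + (long)indice[3] * offsetW;
    return idx * output_width + gidx;
}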

File diff suppressed because it is too large

View File

@ -29,6 +29,7 @@
#include "VX/vx_ext_program.h" #include "VX/vx_ext_program.h"
#include "vsi_nn_platform.h" #include "vsi_nn_platform.h"
#include "vsi_nn_prv.h" #include "vsi_nn_prv.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_log.h" #include "vsi_nn_log.h"
#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel.h"
@ -198,10 +199,11 @@ static vsi_status vsi_nn_RegisterVXKernel
vx_size * program_len = NULL; vx_size * program_len = NULL;
const char **program_src = NULL; const char **program_src = NULL;
vx_context ctx = NULL; vx_context ctx = NULL;
vsi_nn_context_t context = NULL;
vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index]; vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index];
uint8_t i = 0; uint8_t i = 0;
vsi_bool load_from_file = FALSE; vsi_bool load_from_file = FALSE;
vsi_nn_runtime_option_t* options;
options = ((vsi_nn_graph_prv_t*)graph)->options;
#define MAX_BUILDPROGRAM_LEN 128 #define MAX_BUILDPROGRAM_LEN 128
char cmd[MAX_BUILDPROGRAM_LEN] = {0}; char cmd[MAX_BUILDPROGRAM_LEN] = {0};
@ -210,8 +212,7 @@ static vsi_status vsi_nn_RegisterVXKernel
memset(cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN); memset(cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN);
status = VSI_FAILURE; status = VSI_FAILURE;
ctx = vxGetContext( (vx_reference)graph->g ); ctx = vxGetContext( (vx_reference)graph->g );
context = graph->ctx; evis = options->config.evis.ver;
evis = context->config.evis.ver;
program_src = (const char**)malloc(kernel_info->resource_num * sizeof(char *)); program_src = (const char**)malloc(kernel_info->resource_num * sizeof(char *));
CHECK_PTR_FAIL_GOTO( program_src, "Create buffer fail.", final ); CHECK_PTR_FAIL_GOTO( program_src, "Create buffer fail.", final );
@ -244,12 +245,12 @@ static vsi_status vsi_nn_RegisterVXKernel
{ {
// set default evis version to 2 // set default evis version to 2
snprintf(cmd, MAX_BUILDPROGRAM_LEN, snprintf(cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va); "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", options->config.use_40bits_va);
} }
else else
{ {
snprintf(cmd, MAX_BUILDPROGRAM_LEN, snprintf(cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va); "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, options->config.use_40bits_va);
} }
status = vxBuildProgram(program, cmd); status = vxBuildProgram(program, cmd);
@ -302,7 +303,7 @@ static vsi_status vsi_nn_RegisterBinKernel
vx_size program_len = 0; vx_size program_len = 0;
const uint8_t *program_ptr = NULL; const uint8_t *program_ptr = NULL;
vx_context ctx; vx_context ctx;
vsi_nn_context_t context; vsi_nn_runtime_option_t* options;
vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index]; vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index];
#define MAX_BUILDPROGRAM_LEN 128 #define MAX_BUILDPROGRAM_LEN 128
@ -313,8 +314,8 @@ static vsi_status vsi_nn_RegisterBinKernel
status = VSI_FAILURE; status = VSI_FAILURE;
ctx = vxGetContext( (vx_reference)graph->g ); ctx = vxGetContext( (vx_reference)graph->g );
context = graph->ctx; options = ((vsi_nn_graph_prv_t*)graph)->options;
evis = context->config.evis.ver; evis = options->config.evis.ver;
program_ptr = vsi_nn_VxBinResourceGetResource( program_ptr = vsi_nn_VxBinResourceGetResource(
kernel_info->resource_name[kernel_info->resource_num - 1], &program_len); kernel_info->resource_name[kernel_info->resource_num - 1], &program_len);
@ -337,12 +338,12 @@ static vsi_status vsi_nn_RegisterBinKernel
{ {
// set default evis version to 2 // set default evis version to 2
snprintf(cmd, MAX_BUILDPROGRAM_LEN, snprintf(cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va); "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", options->config.use_40bits_va);
} }
else else
{ {
snprintf(cmd, MAX_BUILDPROGRAM_LEN, snprintf(cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va); "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, options->config.use_40bits_va);
} }
#else #else
snprintf(cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension"); snprintf(cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension");
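The build options themselves are unchanged by this refactor; for illustration, with EVIS version 2 and 40-bit VA enabled the string handed to vxBuildProgram would read (example values only):
-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=1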

View File

@ -35,6 +35,8 @@
#include "utils/vsi_nn_constraint_check.h" #include "utils/vsi_nn_constraint_check.h"
#include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h" #include "kernel/vsi_nn_kernel_eltwise.h"
#include "vsi_nn_tensor_util_prv.h"
#include "vsi_nn_error.h"
static vsi_status _try_set_high_presision_tensor static vsi_status _try_set_high_presision_tensor
( (
@ -120,9 +122,22 @@ static vsi_status _static_batchnorm
vsi_nn_tensor_t ** outputs vsi_nn_tensor_t ** outputs
) )
{ {
#define _TENSOR_LEN 64
vsi_status status; vsi_status status;
vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_param_t * param = NULL;
vsi_nn_tensor_t* reshape_tensors[6] = { NULL }; vsi_nn_tensor_t* reshape_tensors[6] = { NULL };
vsi_size_t shape[VSI_NN_MAX_DIM_NUM];
uint32_t new_rank = 4;
vsi_nn_tensor_t* input0 = NULL;
vsi_nn_tensor_t* output = NULL;
char reshape0_tensor_name[_TENSOR_LEN];
char reshape1_tensor_name[_TENSOR_LEN];
char batch_norm_tensor_name[_TENSOR_LEN];
memset(reshape0_tensor_name, 0, sizeof(reshape0_tensor_name));
memset(reshape1_tensor_name, 0, sizeof(reshape1_tensor_name));
memset(batch_norm_tensor_name, 0, sizeof(batch_norm_tensor_name));
status = VSI_FAILURE; status = VSI_FAILURE;
status = _try_set_high_presision_tensor(inputs); status = _try_set_high_presision_tensor(inputs);
@ -133,8 +148,41 @@ static vsi_status _static_batchnorm
} }
if (_require_reshape(self, inputs)) if (_require_reshape(self, inputs))
{ {
reshape_tensors[0] = self->nn_param.batch_norm.local->reshaped_input; if (3 == inputs[0]->attr.dim_num)
reshape_tensors[5] = self->nn_param.batch_norm.local->reshaped_output; {
shape[0] = inputs[0]->attr.size[0];
shape[1] = 1;
shape[2] = inputs[0]->attr.size[1];
shape[3] = inputs[0]->attr.size[2];
}
else if (5 == inputs[0]->attr.dim_num)
{
shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
shape[1] = inputs[0]->attr.size[2];
shape[2] = inputs[0]->attr.size[3];
shape[3] = inputs[0]->attr.size[4];
}
input0 = vsi_nn_kernel_insert_reshape_node(self->graph,
inputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_BACKWARD);
CHECK_PTR_FAIL_GOTO(input0, "Create tensor fail.", final);
reshape_tensors[0] = input0;
snprintf(reshape0_tensor_name, sizeof(reshape0_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0);
if (vxSetReferenceName((vx_reference)reshape_tensors[0]->t, reshape0_tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u reshape 0 node output name fail", self->uid);
goto final;
}
output = vsi_nn_kernel_insert_reshape_node(self->graph,
outputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_FORWARD);
CHECK_PTR_FAIL_GOTO(output, "Create tensor fail.", final);
reshape_tensors[5] = output;
snprintf(reshape1_tensor_name, sizeof(reshape1_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1);
if (vxSetReferenceName((vx_reference)outputs[0]->t, reshape1_tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u reshap 1 node output name fail", self->uid);
goto final;
}
} }
else else
{ {
@ -162,6 +210,20 @@ static vsi_status _static_batchnorm
vsi_nn_kernel_param_release(&param); vsi_nn_kernel_param_release(&param);
if (output)
{
snprintf(batch_norm_tensor_name, sizeof(batch_norm_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 2);
if (vxSetReferenceName((vx_reference)output->t, batch_norm_tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u instance_norm node output name fail", self->uid);
goto final;
}
}
final:
vsi_safe_release_tensor(input0);
vsi_safe_release_tensor(output);
return status; return status;
} }
@ -313,68 +375,6 @@ static vsi_status op_compute
return status; return status;
} /* op_compute() */ } /* op_compute() */
static vsi_status op_optimize
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
vsi_nn_opt_direction_e direction
)
{
uint32_t dim = 0;
vsi_nn_batcnnorm_lcl_data *local = NULL;
vsi_size_t shape[VSI_NN_MAX_DIM_NUM];
char tensor_name[128];
dim = inputs[0]->attr.dim_num;
if(_require_reshape(self, inputs) == FALSE)
{
return VSI_SUCCESS;
}
VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid);
/*
reshape 3d input (xcn) --> 4d input (whcn)
reshape 3d output(xcn) --> 4d output(whcn)
*/
dim = 4;
if (3 == inputs[0]->attr.dim_num)
{
shape[0] = inputs[0]->attr.size[0];
shape[1] = 1;
shape[2] = inputs[0]->attr.size[1];
shape[3] = inputs[0]->attr.size[2];
}
else if (5 == inputs[0]->attr.dim_num)
{
shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
shape[1] = inputs[0]->attr.size[2];
shape[2] = inputs[0]->attr.size[3];
shape[3] = inputs[0]->attr.size[4];
}
local = self->nn_param.batch_norm.local;
if (VSI_NN_OPTIMIZE_BACKWARD == direction)
{
local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim);
}
else
{
local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim);
if(local->reshaped_output && local->reshaped_output->t)
{
memset(tensor_name, 0, sizeof(tensor_name));
snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid);
if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u batchnorm reshaped output name fail", self->uid);
return VSI_FAILURE;
}
}
}
return VSI_SUCCESS;
} /* op_optimize() */
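The rank normalization that used to live in op_optimize is now done inline in _static_batchnorm; in both places it reduces to the shape mapping sketched below (a minimal sketch using the project's vsi_size_t, mirroring the branches above):
/* 3D (x, c, n) is padded to 4D (x, 1, c, n); a 5D input collapses its two
   innermost dims so the result is again 4D. */
static void batchnorm_normalize_shape(const vsi_size_t *in, uint32_t rank, vsi_size_t out[4])
{
    if (rank == 3)
    {
        out[0] = in[0]; out[1] = 1; out[2] = in[1]; out[3] = in[2];
    }
    else if (rank == 5)
    {
        out[0] = in[0] * in[1]; out[1] = in[2]; out[2] = in[3]; out[3] = in[4];
    }
}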
static vsi_bool _dynamic_check static vsi_bool _dynamic_check
( (
vsi_nn_node_t * self, vsi_nn_node_t * self,
@ -494,58 +494,6 @@ static vsi_bool op_check
} }
} /* op_check() */ } /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_nn_batcnnorm_lcl_data *local = NULL;
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
memcpy( outputs[0]->attr.size, inputs[0]->attr.size,
VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) );
}
if(_require_reshape(self, inputs))
{
local = (vsi_nn_batcnnorm_lcl_data *)malloc(sizeof(vsi_nn_batcnnorm_lcl_data));
if(NULL == local)
{
return VSI_FAILURE;
}
memset(local, 0, sizeof(vsi_nn_batcnnorm_lcl_data));
self->nn_param.batch_norm.local = local;
}
return TRUE;
} /* op_setup() */
static vsi_status op_deinit
(
vsi_nn_node_t * self
)
{
vsi_nn_batch_norm_param *p = &(self->nn_param.batch_norm);
if(p->local)
{
if (p->local->reshaped_input)
{
vsi_nn_ReleaseTensor(&(p->local->reshaped_input));
p->local->reshaped_input = NULL;
}
if (p->local->reshaped_output)
{
vsi_nn_ReleaseTensor(&(p->local->reshaped_output));
p->local->reshaped_output = NULL;
}
vsi_nn_safe_free(p->local);
}
vsi_nn_op_common_deinit(self);
return VSI_SUCCESS;
}
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
@ -555,10 +503,10 @@ DEF_OP_REG
/* op_name */ BATCH_NORM, /* op_name */ BATCH_NORM,
/* init */ NULL, /* init */ NULL,
/* compute */ op_compute, /* compute */ op_compute,
/* deinit */ op_deinit, /* deinit */ vsi_nn_op_common_deinit,
/* check */ op_check, /* check */ op_check,
/* setup */ op_setup, /* setup */ vsi_nn_op_common_setup,
/* optimize */ op_optimize, /* optimize */ NULL,
/* input_num */ 5, /* input_num */ 5,
/* output_num */ 1 /* output_num */ 1
); );

View File

@ -118,6 +118,7 @@ static vsi_bool op_setup
if (outputs[0]->attr.dim_num == 0) if (outputs[0]->attr.dim_num == 0)
{ {
outputs[0]->attr.size[0] = 1; outputs[0]->attr.size[0] = 1;
outputs[0]->attr.dim_num = 1;
vsi_nn_SetTensorIsScalar(outputs[0], TRUE); vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
} }
else else

View File

@ -82,6 +82,7 @@ static vsi_bool op_check
{ {
BEGIN_IO_TYPE_DECL(CUMSUM, 1, 1) BEGIN_IO_TYPE_DECL(CUMSUM, 1, 1)
IO_TYPE(D_U32, D_U32) IO_TYPE(D_U32, D_U32)
IO_TYPE(D_I32, D_I32)
IO_TYPE(D_F32, D_F32) IO_TYPE(D_F32, D_F32)
IO_TYPE(D_F16, D_F16) IO_TYPE(D_F16, D_F16)
IO_TYPE(D_BF16, D_BF16) IO_TYPE(D_BF16, D_BF16)

View File

@ -253,6 +253,7 @@ static vsi_bool op_check
IO_TYPE(D_BOOL8, D_I32) IO_TYPE(D_BOOL8, D_I32)
IO_TYPE(D_BOOL8, D_U16) IO_TYPE(D_BOOL8, D_U16)
IO_TYPE(D_BOOL8, D_U32) IO_TYPE(D_BOOL8, D_U32)
IO_TYPE(D_BOOL8, D_BF16)
IO_TYPE(D_U8|Q_ASYM, D_BOOL8) IO_TYPE(D_U8|Q_ASYM, D_BOOL8)
IO_TYPE(D_I8|Q_ASYM, D_BOOL8) IO_TYPE(D_I8|Q_ASYM, D_BOOL8)
IO_TYPE(D_I8|Q_DFP, D_BOOL8) IO_TYPE(D_I8|Q_DFP, D_BOOL8)

View File

@ -155,10 +155,10 @@ vsi_bool vsi_nn_op_eltwise_setup
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_bool ret = TRUE; vsi_bool ret = TRUE;
out_rank = inputs[0]->attr.dim_num; out_rank = vsi_nn_get_tensor_dims(inputs[0]);
for ( i = 1; i < self->input.num; i++) for ( i = 1; i < self->input.num; i++)
{ {
in2_rank = inputs[i]->attr.dim_num; in2_rank = vsi_nn_get_tensor_dims(inputs[i]);
out_rank = vsi_nn_max( out_rank, in2_rank ); out_rank = vsi_nn_max( out_rank, in2_rank );
} }
@ -166,10 +166,10 @@ vsi_bool vsi_nn_op_eltwise_setup
{ {
vsi_size_t sz0, sz1; vsi_size_t sz0, sz1;
sz0 = i < inputs[0]->attr.dim_num ? inputs[0]->attr.size[i] : 1; sz0 = i < vsi_nn_get_tensor_dims(inputs[0]) ? inputs[0]->attr.size[i] : 1;
for ( j = 1; j < self->input.num; j++) for ( j = 1; j < self->input.num; j++)
{ {
sz1 = i < inputs[j]->attr.dim_num ? inputs[j]->attr.size[i] : 1; sz1 = i < vsi_nn_get_tensor_dims(inputs[j]) ? inputs[j]->attr.size[i] : 1;
sz0 = vsi_nn_max( sz0, sz1 ); sz0 = vsi_nn_max( sz0, sz1 );
if (sz0 != sz1 && sz0 != 1 && sz1 != 1) if (sz0 != sz1 && sz0 != 1 && sz1 != 1)
{ {
@ -187,11 +187,12 @@ vsi_bool vsi_nn_op_eltwise_setup
{ {
outputs[0]->attr.dim_num = out_rank; outputs[0]->attr.dim_num = out_rank;
memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) );
if (out_rank == 1 && if (vsi_nn_GetTensorIsScalar(inputs[0]) &&
vsi_nn_GetTensorIsScalar(inputs[0]) &&
vsi_nn_GetTensorIsScalar(inputs[1])) vsi_nn_GetTensorIsScalar(inputs[1]))
{ {
vsi_nn_SetTensorIsScalar(outputs[0], TRUE); vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
outputs[0]->attr.size[0] = 1;
outputs[0]->attr.dim_num = 1;
} }
} }
else else

View File

@ -199,6 +199,7 @@ static vsi_bool op_setup
if (o_rank == 0) if (o_rank == 0)
{ {
outputs[0]->attr.size[0] = 1; outputs[0]->attr.size[0] = 1;
outputs[0]->attr.dim_num = 1;
vsi_nn_SetTensorIsScalar(outputs[0], TRUE); vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
} }
else else

View File

@ -306,6 +306,8 @@ static vsi_bool _op_check
IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM) IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM) IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM)
IO_TYPE(D_U16|Q_ASYM, D_F32, D_F32, D_U16|Q_ASYM)
IO_TYPE(D_U16|Q_SYM, D_F32, D_F32, D_U16|Q_SYM)
END_IO_TYPE_DECL(GROUP_NORM) END_IO_TYPE_DECL(GROUP_NORM)
if (!VALIDATE_OP_IO_TYPES(GROUP_NORM, self, inputs, self->input.num, outputs, self->output.num)) if (!VALIDATE_OP_IO_TYPES(GROUP_NORM, self, inputs, self->input.num, outputs, self->output.num))
{ {

View File

@ -25,6 +25,7 @@
#include <stdlib.h> #include <stdlib.h>
#include "vsi_nn_types.h" #include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h" #include "vsi_nn_platform.h"
#include "vsi_nn_log.h" #include "vsi_nn_log.h"
#include "vsi_nn_graph.h" #include "vsi_nn_graph.h"
@ -197,6 +198,7 @@ static vsi_bool op_setup_default
vsi_nn_internal_tensor_t * hstate_fc_outputs[GRUCELL_GATE_CNT] = { NULL }; vsi_nn_internal_tensor_t * hstate_fc_outputs[GRUCELL_GATE_CNT] = { NULL };
vsi_nn_internal_tensor_t * h_times_r = NULL; vsi_nn_internal_tensor_t * h_times_r = NULL;
vsi_nn_tensor_attr_t attr; vsi_nn_tensor_attr_t attr;
vsi_nn_activation_e recurrent_activation = p->recurrent_activation;
vsi_nn_internal_init_node_wksp( self ); vsi_nn_internal_init_node_wksp( self );
@ -230,7 +232,8 @@ static vsi_bool op_setup_default
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
if (inputs[GRUCELL_IN_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 || if (inputs[GRUCELL_IN_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ||
self->graph->ctx->config.support_stream_processor) (((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor &&
recurrent_activation == VSI_NN_ACT_SIGMOID))
{ {
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
} }

View File

@ -93,37 +93,15 @@ static vsi_bool op_check
{ {
BEGIN_IO_TYPE_DECL(L1_LAYER_NORM, 4, 1) BEGIN_IO_TYPE_DECL(L1_LAYER_NORM, 4, 1)
IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32) IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F16) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_DFP) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_ASYM) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_SYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_SYM) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_SYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_DFP) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_ASYM) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_SYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_SYM) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_SYM)
IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_BF16) IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_BF16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_I16|Q_SYM)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_I8|Q_SYM)
IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_I16|Q_DFP)

View File

@ -25,6 +25,7 @@
#include <stdlib.h> #include <stdlib.h>
#include "vsi_nn_types.h" #include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h" #include "vsi_nn_platform.h"
#include "vsi_nn_graph.h" #include "vsi_nn_graph.h"
#include "vsi_nn_node.h" #include "vsi_nn_node.h"
@ -351,7 +352,7 @@ static vsi_bool op_setup
} }
else if ( ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 && else if ( ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 &&
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) || outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) ||
self->graph->ctx->config.support_stream_processor ) ((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor )
{ {
vsi_nn_internal_tensor_t* output_tensor = NULL; vsi_nn_internal_tensor_t* output_tensor = NULL;
vsi_nn_internal_tensor_t* reshape_tensor = NULL; vsi_nn_internal_tensor_t* reshape_tensor = NULL;

View File

@ -106,7 +106,7 @@ static vsi_bool op_setup
vsi_nn_internal_init_node_wksp( self ); vsi_nn_internal_init_node_wksp( self );
if ( axis != 0 && !self->graph->ctx->config.support_stream_processor) if ( axis != 0 && !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor)
{ {
vsi_nn_internal_tensor_t* mean_tensor = NULL; vsi_nn_internal_tensor_t* mean_tensor = NULL;
vsi_nn_internal_tensor_t* vari_tensor = NULL; vsi_nn_internal_tensor_t* vari_tensor = NULL;

View File

@ -25,6 +25,7 @@
#include <stdlib.h> #include <stdlib.h>
#include "vsi_nn_types.h" #include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h" #include "vsi_nn_platform.h"
#include "vsi_nn_log.h" #include "vsi_nn_log.h"
#include "vsi_nn_graph.h" #include "vsi_nn_graph.h"
@ -139,7 +140,7 @@ static vsi_bool op_setup
p->is_cifg = inputs[LSTMUNIT_ACT_INPUT_FC_I] == NULL; p->is_cifg = inputs[LSTMUNIT_ACT_INPUT_FC_I] == NULL;
p->is_projection = outputs[LSTMUNIT_ACT_HSTATE_OUT] == NULL; p->is_projection = outputs[LSTMUNIT_ACT_HSTATE_OUT] == NULL;
if (self->graph->ctx->config.support_stream_processor) if (((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor)
{ {
p->is_layer_norm = inputs[LSTMUNIT_ACT_HSTATE_FC_F] == NULL; p->is_layer_norm = inputs[LSTMUNIT_ACT_HSTATE_FC_F] == NULL;
} }

View File

@ -100,6 +100,7 @@ static vsi_bool op_check
IO_TYPE(D_I32, D_I16|Q_ASYM) IO_TYPE(D_I32, D_I16|Q_ASYM)
IO_TYPE(D_I32, D_I16|Q_SYM) IO_TYPE(D_I32, D_I16|Q_SYM)
IO_TYPE(D_I32, D_I32) IO_TYPE(D_I32, D_I32)
IO_TYPE(D_I32, D_BF16)
IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM)
IO_TYPE(D_U8|Q_ASYM, D_I16|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_SYM)
IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP)
@ -111,8 +112,10 @@ static vsi_bool op_check
IO_TYPE(D_U8|Q_ASYM, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_BF16)
IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_ASYM, D_F16) IO_TYPE(D_I8|Q_ASYM, D_F16)
IO_TYPE(D_I8|Q_ASYM, D_BF16)
IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_DFP, D_F16) IO_TYPE(D_I8|Q_DFP, D_F16)
IO_TYPE(D_I8|Q_DFP, D_BF16)
IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP)
IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
@ -124,11 +127,14 @@ static vsi_bool op_check
IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM)
IO_TYPE(D_I16|Q_ASYM, D_F16) IO_TYPE(D_I16|Q_ASYM, D_F16)
IO_TYPE(D_I16|Q_ASYM, D_BF16)
IO_TYPE(D_I16|Q_ASYM, D_F32) IO_TYPE(D_I16|Q_ASYM, D_F32)
IO_TYPE(D_I16|Q_SYM, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_SYM, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM)
IO_TYPE(D_I16|Q_SYM, D_BF16)
IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM)
IO_TYPE(D_I8|Q_SYM, D_F16) IO_TYPE(D_I8|Q_SYM, D_F16)
IO_TYPE(D_I8|Q_SYM, D_BF16)
IO_TYPE(D_BF16, D_BF16) IO_TYPE(D_BF16, D_BF16)
END_IO_TYPE_DECL(ONE_HOT) END_IO_TYPE_DECL(ONE_HOT)
if (!VALIDATE_OP_IO_TYPES(ONE_HOT, self, inputs, self->input.num, outputs, self->output.num)) if (!VALIDATE_OP_IO_TYPES(ONE_HOT, self, inputs, self->input.num, outputs, self->output.num))

View File

@ -36,6 +36,7 @@
#include "vsi_nn_tensor_util.h" #include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h" #include "utils/vsi_nn_constraint_check.h"
#include "vsi_nn_error.h"
#define _INPUT_NUM (1) #define _INPUT_NUM (1)
#define _OUTPUT_NUM (1) #define _OUTPUT_NUM (1)
@ -50,22 +51,38 @@ static vsi_status op_compute
vsi_status status = VSI_FAILURE; vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_param_t * param = NULL;
vsi_nn_kernel_node_t n = NULL; vsi_nn_kernel_node_t n = NULL;
vsi_nn_tensor_t* reshape_tensor = NULL;
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_nn_pre_process_rgb_param* p = NULL;
memcpy(shape, inputs[0]->attr.size, inputs[0]->attr.dim_num * sizeof(vsi_size_t));
shape[0] = shape[1] * shape[0];
shape[1] = shape[2];
shape[2] = 1;
reshape_tensor = vsi_nn_reshape_tensor(self->graph,
inputs[0], shape, inputs[0]->attr.dim_num);
CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor failed", final);
p = (vsi_nn_pre_process_rgb_param*)&(self->nn_param.pre_process_rgb);
param = vsi_nn_kernel_param_create(); param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_rgb.local.scale_x ); vsi_nn_kernel_param_add_int32( param, "scale_x", p->local->scale_x );
vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_rgb.local.scale_y ); vsi_nn_kernel_param_add_int32( param, "scale_y", p->local->scale_y );
vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_rgb.rect.left ); vsi_nn_kernel_param_add_int32( param, "left", p->rect.left );
vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_rgb.rect.top ); vsi_nn_kernel_param_add_int32( param, "top", p->rect.top );
vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_rgb.r_mean ); vsi_nn_kernel_param_add_float32( param, "r_mean", p->r_mean );
vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_rgb.g_mean ); vsi_nn_kernel_param_add_float32( param, "g_mean", p->g_mean );
vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb.b_mean ); vsi_nn_kernel_param_add_float32( param, "b_mean", p->b_mean );
vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_rgb.r_scale ); vsi_nn_kernel_param_add_float32( param, "r_scale", p->r_scale );
vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_rgb.g_scale ); vsi_nn_kernel_param_add_float32( param, "g_scale", p->g_scale );
vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_rgb.b_scale ); vsi_nn_kernel_param_add_float32( param, "b_scale", p->b_scale );
vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_rgb.reverse_channel ); vsi_nn_kernel_param_add_int32( param, "reverse", p->reverse_channel );
vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_rgb.local.enable_perm ); vsi_nn_kernel_param_add_int32( param, "enable_perm", p->local->enable_perm );
vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb.local.enable_copy ); vsi_nn_kernel_param_add_int32( param, "enable_copy", p->local->enable_copy );
n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", inputs, 1, outputs, 1, param ); n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", &reshape_tensor, 1, outputs, 1, param );
if ( n != NULL ) if ( n != NULL )
{ {
self->n = (vx_node)n; self->n = (vx_node)n;
@ -77,6 +94,9 @@ static vsi_status op_compute
vsi_nn_kernel_param_release( &param ); vsi_nn_kernel_param_release( &param );
} }
final:
vsi_safe_release_tensor(reshape_tensor);
return status; return status;
} /* op_compute() */ } /* op_compute() */
@ -166,35 +186,57 @@ static vsi_bool op_setup
} }
self->nn_param.pre_process_rgb.local.enable_perm = FALSE; p->local->enable_perm = FALSE;
if (self->nn_param.pre_process_rgb.local.enable_perm == FALSE) if (p->local->enable_perm == FALSE)
{ {
p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]); p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]);
p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]); p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
} }
else else
{ {
p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]); p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]); p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]);
} }
p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15))); p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15)));
return TRUE; return TRUE;
} /* op_setup() */ } /* op_setup() */
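The scale_x/scale_y values computed in op_setup above are Q15 fixed-point resize ratios between the crop rectangle and the output size; for example (illustrative numbers):
int32_t scale_x = (int32_t)((rect_width << 15) / out_width);   /* Q15 ratio */
/* rect_width = 448, out_width = 224  ->  scale_x = 65536 (ratio 2.0, downscale)        */
/* rect_width = 224, out_width = 224  ->  scale_x = 32768 (ratio 1.0, enable_copy path) */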
static vsi_status op_init
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
self->nn_param.pre_process_rgb.local =
(vsi_nn_pre_process_rgb_lcl_data*)malloc(sizeof(vsi_nn_pre_process_rgb_lcl_data));
if (NULL == self->nn_param.pre_process_rgb.local)
{
return VX_ERROR_NO_MEMORY;
}
memset(self->nn_param.pre_process_rgb.local, 0, sizeof(vsi_nn_pre_process_rgb_lcl_data));
return status;
} /* op_init() */
static vsi_status op_deinit static vsi_status op_deinit
( (
vsi_nn_node_t * self vsi_nn_node_t * self
) )
{ {
if (self->nn_param.pre_process_rgb.local.local_tensor != NULL) if (self->nn_param.pre_process_rgb.local->local_tensor != NULL)
{ {
vxReleaseTensor(&self->nn_param.pre_process_rgb.local.local_tensor); vxReleaseTensor(&self->nn_param.pre_process_rgb.local->local_tensor);
self->nn_param.pre_process_rgb.local.local_tensor = NULL; self->nn_param.pre_process_rgb.local->local_tensor = NULL;
} }
vsi_nn_safe_free(self->nn_param.pre_process_rgb.local);
vsi_nn_op_common_deinit(self); vsi_nn_op_common_deinit(self);
return VSI_SUCCESS; return VSI_SUCCESS;
@ -208,7 +250,7 @@ extern "C" {
DEF_OP_REG DEF_OP_REG
( (
/* op_name */ PRE_PROCESS_RGB, /* op_name */ PRE_PROCESS_RGB,
/* init */ NULL, /* init */ op_init,
/* compute */ op_compute, /* compute */ op_compute,
/* deinit */ op_deinit, /* deinit */ op_deinit,
/* check */ op_check, /* check */ op_check,

View File

@ -79,7 +79,10 @@ static vsi_status _prelu_op_compute
vsi_status status = VSI_FAILURE; vsi_status status = VSI_FAILURE;
vsi_nn_prelu_param *prelu = &self->nn_param.prelu; vsi_nn_prelu_param *prelu = &self->nn_param.prelu;
vsi_ssize_t shapes[VSI_NN_MAX_DIM_NUM] = { 1 }; vsi_ssize_t shapes[VSI_NN_MAX_DIM_NUM] = { 1 };
vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_nn_tensor_t* input0 = NULL;
vsi_nn_tensor_t* input1 = NULL;
vsi_nn_tensor_t* output = NULL;
vsi_bool one_rank = FALSE; vsi_bool one_rank = FALSE;
vsi_bool is_per_channel_alpha = 0; vsi_bool is_per_channel_alpha = 0;
vsi_size_t alpha_shape = 1; vsi_size_t alpha_shape = 1;
@ -88,6 +91,7 @@ static vsi_status _prelu_op_compute
uint32_t dims = outputs[0]->attr.dim_num; uint32_t dims = outputs[0]->attr.dim_num;
reshape_tensors[0] = inputs[0]; reshape_tensors[0] = inputs[0];
reshape_tensors[2] = outputs[0];
one_rank = _is_one_rank_tensor(inputs[1], &alpha_shape); one_rank = _is_one_rank_tensor(inputs[1], &alpha_shape);
for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
@ -114,18 +118,23 @@ static vsi_status _prelu_op_compute
dims = inputs[1]->attr.dim_num; dims = inputs[1]->attr.dim_num;
} }
reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, input1 = vsi_nn_reshape_tensor( self->graph,
inputs[1], (vsi_size_t*)shapes, dims ); inputs[1], (vsi_size_t*)shapes, dims );
CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
reshape_tensors[1] = input1;
} }
else else
{ {
memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t)); memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t));
reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, input1 = vsi_nn_reshape_tensor( self->graph,
inputs[1], (vsi_size_t*)shapes, inputs[1]->attr.dim_num ); inputs[1], (vsi_size_t*)shapes, inputs[1]->attr.dim_num );
CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
reshape_tensors[1] = input1;
} }
} }
else else
{ {
uint32_t rank = inputs[0]->attr.dim_num;
dims = inputs[1]->attr.dim_num; dims = inputs[1]->attr.dim_num;
memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t)); memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t));
@ -141,9 +150,32 @@ static vsi_status _prelu_op_compute
shapes[1] = 1; shapes[1] = 1;
dims = 2; dims = 2;
} }
else if (one_rank && inputs[1]->attr.is_const == TRUE &&
alpha_shape == inputs[0]->attr.size[0] &&
alpha_shape == inputs[1]->attr.size[0] &&
rank < 3)
{
is_per_channel_alpha = TRUE;
shapes[0] = 1;
shapes[1] = 1;
shapes[2] = alpha_shape;
shapes[3] = rank > 1 ? inputs[0]->attr.size[1] : 1;
dims = 4;
input0 = vsi_nn_reshape_tensor(self->graph, inputs[0], (vsi_size_t*)shapes, dims);
CHECK_PTR_FAIL_GOTO(input0, "Create tensor fail.", final);
reshape_tensors[0] = input0;
output = vsi_nn_reshape_tensor(self->graph, outputs[0], (vsi_size_t*)shapes, dims);
CHECK_PTR_FAIL_GOTO(output, "Create tensor fail.", final);
reshape_tensors[2] = output;
shapes[0] = alpha_shape;
shapes[1] = 1;
dims = 2;
}
reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, input1 = vsi_nn_reshape_tensor( self->graph,
inputs[1], (vsi_size_t*)shapes, dims ); inputs[1], (vsi_size_t*)shapes, dims );
CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
reshape_tensors[1] = input1;
} }
// Add params // Add params
@ -153,15 +185,19 @@ static vsi_status _prelu_op_compute
self->n = (vx_node)vsi_nn_kernel_selector( self->graph, self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
kernel_name, kernel_name,
&reshape_tensors[0], 2, &reshape_tensors[0], 2,
outputs, 1, param ); &reshape_tensors[2], 1, param );
vsi_nn_kernel_param_release( &param ); vsi_nn_kernel_param_release( &param );
vsi_nn_ReleaseTensor( &reshape_tensors[1] );
if ( self->n ) if ( self->n )
{ {
status = VSI_SUCCESS; status = VSI_SUCCESS;
} }
final:
vsi_safe_release_tensor(input0);
vsi_safe_release_tensor(input1);
vsi_safe_release_tensor(output);
return status; return status;
} /* _prelu_op_compute() */ } /* _prelu_op_compute() */
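The new per-channel branch above only rearranges the operands so that the length-C alpha lines up with the channel axis the kernel expects; assuming a 2-D input of shape (C, N) with a constant length-C alpha (an illustrative case, not the only one handled), the shapes it produces are:
/* input/output (C, N) -> (1, 1, C, N); alpha (C) -> (C, 1). */
static void prelu_per_channel_shapes(vsi_size_t C, vsi_size_t N,
                                     vsi_size_t in_out[4], vsi_size_t alpha[2])
{
    in_out[0] = 1; in_out[1] = 1; in_out[2] = C; in_out[3] = N;
    alpha[0]  = C; alpha[1]  = 1;
}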
@ -219,8 +255,16 @@ static vsi_bool op_check
IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16)
IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) IO_TYPE(D_I8|Q_DFP, D_F16, D_F16)
IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM)
IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_SYM, D_F16, D_F16)
IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16)
IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) IO_TYPE(D_I16|Q_DFP, D_F16, D_F16)
IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM)
IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_F16, D_F16)
IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16)
IO_TYPE(D_BF16, D_F16, D_BF16) IO_TYPE(D_BF16, D_F16, D_BF16)
IO_TYPE(D_BF16, D_BF16, D_BF16) IO_TYPE(D_BF16, D_BF16, D_BF16)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)

View File

@ -162,7 +162,7 @@ static vsi_bool _check_is_sp_supported_type
int32_t * axes = self->nn_param.reduce.local2->axes; int32_t * axes = self->nn_param.reduce.local2->axes;
int32_t axes_num = self->nn_param.reduce.local2->axes_num; int32_t axes_num = self->nn_param.reduce.local2->axes_num;
if ( !self->graph->ctx->config.support_stream_processor || if ( !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor ||
(type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN && type != VSI_NN_REDUCE_MAX) ) (type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN && type != VSI_NN_REDUCE_MAX) )
{ {
return FALSE; return FALSE;
@ -788,7 +788,7 @@ static vsi_bool op_set_reduce_axis(
} }
*out_rank_x = inputs[0]->attr.dim_num; *out_rank_x = inputs[0]->attr.dim_num;
} }
else if (!self->graph->ctx->config.support_stream_processor || else if (!((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor ||
resolved_dim_count > 2) resolved_dim_count > 2)
{ {
optimzation_input_size( optimzation_input_size(

View File

@ -61,7 +61,7 @@ static vsi_status op_compute
vx_nn_reshape_params_t reshape_param; vx_nn_reshape_params_t reshape_param;
memset(&attr, 0, sizeof(attr)); memset(&attr, 0, sizeof(attr));
attr.size[0] = self->nn_param.reshape.dim_num; attr.size[0] = vsi_nn_max(self->nn_param.reshape.dim_num, 1);
attr.dim_num = 1; attr.dim_num = 1;
attr.is_const = TRUE; attr.is_const = TRUE;
attr.dtype.vx_type = VSI_NN_TYPE_INT32; attr.dtype.vx_type = VSI_NN_TYPE_INT32;
@ -123,6 +123,16 @@ static vsi_bool op_setup
{ {
vsi_bool ret = TRUE; vsi_bool ret = TRUE;
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
if (self->nn_param.reshape.dim_num == 0 ||
self->nn_param.reshape.size == NULL
)
{
outputs[0]->attr.size[0] = 1;
outputs[0]->attr.dim_num = 1;
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
else
{ {
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
uint32_t i = 0; uint32_t i = 0;
@ -136,6 +146,7 @@ static vsi_bool op_setup
shape, shape,
self->nn_param.reshape.dim_num); self->nn_param.reshape.dim_num);
} }
}
return ret; return ret;
} /* op_setup() */ } /* op_setup() */

View File

@@ -66,7 +66,7 @@ static vsi_status op_compute
    }
    memset(&attr, 0, sizeof(attr));
-   attr.size[0] = self->nn_param.reshape2.dim_num;
+   attr.size[0] = vsi_nn_max(self->nn_param.reshape2.dim_num, 1);
    attr.dim_num = 1;
    attr.is_const = TRUE;
    attr.dtype.vx_type = VSI_NN_TYPE_INT32;
@@ -160,6 +160,16 @@ static vsi_bool op_setup
{
    vsi_bool ret = TRUE;
    if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+   {
+       if (self->nn_param.reshape2.dim_num == 0 ||
+           self->nn_param.reshape2.size == NULL
+           )
+       {
+           outputs[0]->attr.size[0] = 1;
+           outputs[0]->attr.dim_num = 1;
+           vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
+       }
+       else
        {
            vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
            memcpy(shape, self->nn_param.reshape2.size,
@@ -169,6 +179,7 @@ static vsi_bool op_setup
                shape,
                self->nn_param.reshape2.dim_num);
        }
+   }
    return ret;
} /* op_setup() */
@@ -0,0 +1,145 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
#include "utils/vsi_nn_dtype_util.h"
#include "vsi_nn_error.h"
typedef struct _rope_local_data_t {
int32_t placeholder;
} rope_local_data_t;
/*
Declare number of input and output.
*/
#define _INPUT_NUM (3)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t* param = NULL;
int32_t axis = self->nn_param.rope.axis;
vsi_bool interleaved = self->nn_param.rope.interleaved;
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32(param, "axis", axis);
vsi_nn_kernel_param_add_int32(param, "interleaved", interleaved);
self->n = (vx_node)vsi_nn_kernel_selector(self->graph, "rope",
inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param);
if ( self->n )
{
status = VSI_SUCCESS;
}
if (param != NULL)
{
vsi_nn_kernel_param_release(&param);
}
return status;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
BEGIN_IO_TYPE_DECL(ROPE, _INPUT_NUM, _OUTPUT_NUM)
IO_TYPE(D_F32, D_F32, D_F32, D_F32)
IO_TYPE(D_BF16, D_BF16, D_BF16, D_BF16)
IO_TYPE(D_F16, D_F16, D_F16, D_F16)
IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM)
IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM)
IO_TYPE(D_U16|Q_ASYM, D_U16|Q_ASYM, D_U16|Q_ASYM, D_U16|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP)
IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM, D_I8|Q_SYM)
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM, D_U8|Q_ASYM)
IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_SYM, D_F16, D_F16, D_I8|Q_SYM)
IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16, D_I8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_I16|Q_SYM)
IO_TYPE(D_U16|Q_ASYM, D_F16, D_F16, D_U16|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_I8|Q_DFP)
IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_I8|Q_SYM)
IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_U8|Q_ASYM)
END_IO_TYPE_DECL(ROPE)
if (!VALIDATE_OP_IO_TYPES(ROPE, self, inputs, self->input.num, outputs, self->output.num))
{
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
destroy_op_io_types_desc(desc);
return FALSE;
}
return TRUE;
} /* op_check() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ ROPE,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ vsi_nn_op_common_deinit,
/* check */ op_check,
/* setup */ vsi_nn_op_common_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS
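For orientation only, here is a minimal, hypothetical sketch of how a client graph might drive the newly registered ROPE operator through ovxlib's generic node API. Only nn_param.rope.axis and nn_param.rope.interleaved come from the file above; the three-input meaning (data plus two rotation tables), the tensor-id variables, and the wiring style are assumptions for illustration.

/* Hypothetical usage sketch; not part of this commit. */
vsi_nn_node_t *node = vsi_nn_AddNode(graph, VSI_NN_OP_ROPE,
                                     3 /* inputs */, 1 /* outputs */, NULL);
if (node)
{
    node->nn_param.rope.axis        = 0;     /* axis along which element pairs rotate */
    node->nn_param.rope.interleaved = FALSE; /* pairing layout; exact meaning assumed  */
    node->input.tensors[0]  = data_id;       /* assumed: activations to rotate         */
    node->input.tensors[1]  = cos_table_id;  /* assumed: per-position cosine table     */
    node->input.tensors[2]  = sin_table_id;  /* assumed: per-position sine table       */
    node->output.tensors[0] = rotated_id;
}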
@@ -25,6 +25,7 @@
#include <stdlib.h>
#include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
@@ -188,7 +189,7 @@ static vsi_status op_optimize
    }
    if ( _need_split_softmax(self, inputs) == FALSE ||
        self->nn_param.softmax_internal.axis != 0 ||
-       self->graph->ctx->config.support_stream_processor )
+       ((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor )
    {
        return status;
    }
@@ -39,6 +39,10 @@
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#include "vsi_nn_error.h"
+typedef struct _topk_local_data_t {
+    vsi_bool use_internal_node;
+} topk_local_data_t;
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (2)
@@ -111,11 +115,35 @@ static vsi_status op_compute
    vsi_nn_tensor_t * out1_tensor = NULL;
    vsi_bool ret = FALSE;
-   if (inputs[0]->attr.size[axis] == 1)
+   if (self->nn_param.topk.local->use_internal_node)
    {
        return vsi_nn_internal_compute_node( self );
    }
if (inputs[0]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH)
{
int32_t i = 1;
shapes[0][0] = inputs[0]->attr.size[0];
shapes[1][0] = outputs[0]->attr.size[0];
shapes[0][1] = 1;
shapes[1][1] = 1;
for (i = 1; i < (int32_t)(inputs[0]->attr.dim_num); i++)
{
shapes[0][1] = shapes[0][1] * inputs[0]->attr.size[i];
}
for (i = 1; i < (int32_t)(outputs[0]->attr.dim_num); i++)
{
shapes[1][1] = shapes[1][1] * outputs[0]->attr.size[i];
}
new_axis0 = axis;
new_axis1 = axis;
rank_in = 2;
rank_out = 2;
ret = TRUE;
}
else
{
        ret = vsi_nn_kernel_optimize_softmax_shape(
            inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
            shapes[0], &rank_in, &new_axis0);
@@ -123,7 +151,7 @@ static vsi_status op_compute
        ret = vsi_nn_kernel_optimize_softmax_shape(
            outputs[0]->attr.size, outputs[0]->attr.dim_num, axis,
            shapes[1], &rank_out, &new_axis1);
+   }
    if (ret)
    {
        uint32_t perm_in[VSI_NN_MAX_DIM_NUM] = {0};
@@ -303,6 +331,8 @@ static vsi_bool op_setup
    vsi_nn_internal_tensor_t* const0_input = NULL;
    vsi_nn_tensor_attr_t attr;
+   p->local->use_internal_node = TRUE;
    vsi_nn_internal_init_node_wksp(self);
    curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1);
    CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
@@ -322,6 +352,38 @@ static vsi_bool op_setup
        curr->outputs[0] = outputs[1];
        vsi_nn_internal_setup_node(self, curr);
    }
else if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE)
{
vsi_nn_internal_node_t* curr = NULL;
vsi_nn_internal_tensor_t* temp_tensor = NULL;
vsi_nn_tensor_attr_t attr;
p->local->use_internal_node = TRUE;
vsi_nn_internal_init_node_wksp(self);
memcpy(&attr, &inputs[0]->attr, sizeof(vsi_nn_tensor_attr_t));
attr.dim_num = VSI_NN_DIM_AUTO;
attr.vtl = TRUE;
attr.is_const = FALSE;
temp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
CHECK_PTR_FAIL_GOTO(temp_tensor, "Create tensor failed", final);
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_TOPK, 1, 2);
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.topk.axis = p->axis;
curr->node->nn_param.topk.k = p->k;
curr->inputs[0] = inputs[0];
curr->outputs[0] = temp_tensor->t;
curr->outputs[1] = outputs[1];
vsi_nn_internal_setup_node(self, curr);
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1);
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = temp_tensor->t;
curr->outputs[0] = outputs[0];
vsi_nn_internal_setup_node(self, curr);
}
    return TRUE;
final:
@@ -341,7 +403,7 @@ static vsi_status op_optimize
    VSI_UNREFERENCED(outputs);
    p = &(self->nn_param.topk);
-   if (inputs[0]->attr.size[p->axis] == 1)
+   if (p->local->use_internal_node)
    {
        return vsi_nn_internal_optimize_node( self, direction );
    }
@@ -357,6 +419,14 @@ static vsi_status op_init
    vsi_status status = VSI_SUCCESS;
    self->nn_param.topk.axis = 0;
+   self->nn_param.topk.local = \
+       (topk_local_data_t*)malloc(sizeof(topk_local_data_t));
+   if (NULL == self->nn_param.topk.local)
+   {
+       return VX_ERROR_NO_MEMORY;
+   }
+   memset(self->nn_param.topk.local, 0, sizeof(topk_local_data_t));
    return status;
} /* op_init() */
@@ -364,8 +434,13 @@ static vsi_status op_deinit
    (
    vsi_nn_node_t * self
    )
+{
+    if (self->nn_param.topk.local->use_internal_node)
    {
        vsi_nn_internal_deinit_node_wksp(self);
+    }
+    vsi_nn_safe_free(self->nn_param.topk.local);
    vsi_nn_op_common_deinit(self);
    return VSI_SUCCESS;
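To summarize the control flow introduced above (a restatement of the hunks, not new source): op_setup() now decides once whether TOPK is lowered to internal nodes and caches the result in the new local data, so op_compute(), op_optimize() and op_deinit() only consult the flag instead of re-deriving the condition.

/* Paraphrased decision logic; names follow the diff above. */
if (inputs[0]->attr.size[p->axis] == 1)
{
    p->local->use_internal_node = TRUE;   /* trivial top-k: DATACONVERT pass-through        */
}
else if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE)
{
    p->local->use_internal_node = TRUE;   /* internal TOPK plus DATACONVERT to fix the dtype */
}
else
{
    /* flag stays 0 from op_init()'s memset: the TOPK kernel is selected directly */
}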
@@ -475,6 +475,7 @@ static _op_param_gen_t s_op_gen[] =
    /* GROUPED_CONV3D */ NULL,
    /* COL2IM */ NULL,
    /* L1_LAYER_NORM */ NULL,
+   /* ROPE */ NULL,
};
_compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c );
@@ -98,7 +98,7 @@ static VSI_INLINE_API void _convert_bfloat16_to_float
    uint32_t i;
    for( i = 0; i < size; i ++ )
    {
-       out_buffer[i] = bfp16_to_fp32( (int16_t)buffer[i] );
+       out_buffer[i] = bfp16_to_fp32( (uint16_t)buffer[i] );
    }
} /* _convert_bfloat16_to_float */
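For reference, a self-contained sketch of what a bfloat16-to-float32 conversion computes, assuming bfp16_to_fp32() treats its argument as the raw upper 16 bits of an IEEE-754 float32; handing the bits over as an unsigned value keeps the pattern independent of any sign handling in the callee.

#include <stdint.h>
#include <string.h>

/* Illustrative helper only; an assumption about bfp16_to_fp32 semantics, not its source. */
static float bf16_bits_to_fp32(uint16_t bits)
{
    uint32_t u = (uint32_t)bits << 16; /* bf16 occupies the high half of a float32 */
    float f;
    memcpy(&f, &u, sizeof f);          /* reinterpret without strict-aliasing issues */
    return f;
}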
@@ -40,6 +40,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_log.h"
@@ -1261,7 +1262,9 @@ vsi_bool vsi_nn_is_same_quant_type(
        break;
    }
#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
-   case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC: {
+   case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC:
+   case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC:
+   {
        const float diff = (float)1e-5;
        int32_t i = 0;
        int32_t scale_cnt0 = src_dtype->group_count;
@@ -1627,12 +1630,12 @@ vsi_bool vsi_nn_is_stream_process_supported_types
{
    size_t i = 0;
-   if ( graph->ctx->config.support_stream_processor == 0 )
+   if ( ((vsi_nn_graph_prv_t*)graph)->options->config.support_stream_processor == 0 )
    {
        return FALSE;
    }
-   if ( graph->ctx->config.sp_exec_count == 0 )
+   if ( ((vsi_nn_graph_prv_t*)graph)->options->config.sp_exec_count == 0 )
    {
        return FALSE;
    }
@@ -1769,3 +1772,11 @@ typedef enum
    return support;
}
uint32_t vsi_nn_get_tensor_dims
(
vsi_nn_tensor_t* tensor
)
{
return vsi_nn_GetTensorIsScalar(tensor) ? 0 : tensor->attr.dim_num;
}
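A short usage note tying the new helper to the reshape changes earlier in this commit: a rank-0 result is still stored with dim_num == 1 and size[0] == 1, and the scalar flag is what lets callers report zero dimensions. The tensor variable below is illustrative.

/* Illustrative only. */
uint32_t dims = vsi_nn_get_tensor_dims(tensor); /* 0 when the scalar flag is set            */
uint32_t raw  = tensor->attr.dim_num;           /* underlying storage stays 1 for that case */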
@@ -39,6 +39,9 @@ static vsi_status query_hardware_caps
#endif
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
    vx_hardware_caps_params_ext_t paramExt;
+#if VX_FIXED_FUNCTION_DEVICE_SUPPORT
+   vx_hardware_caps_params_ext3_t paramExt3;
+#endif
    memset(&paramExt, 0, sizeof(vx_hardware_caps_params_ext_t));
    status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt),
@@ -73,6 +76,13 @@ static vsi_status query_hardware_caps
    }
#endif
+#if VX_FIXED_FUNCTION_DEVICE_SUPPORT
+   memset(&paramExt3, 0, sizeof(vx_hardware_caps_params_ext3_t));
+   status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt3),
+       sizeof(vx_hardware_caps_params_ext3_t));
+   context->config.support_ffd = paramExt3.supportFixedFunctionDevice;
+#endif
#endif
    if(param.evis1 == TRUE && param.evis2 == FALSE)
@@ -93,6 +103,85 @@ final:
    return status;
}
vsi_status query_hardware_caps_runtime
(
vsi_nn_context_t context,
vsi_nn_runtime_option_t* options
)
{
vsi_status status = VSI_FAILURE;
vx_hardware_caps_params_t param;
VSI_UNREFERENCED(options);
memset(&(options->config), 0, sizeof(vsi_nn_hw_config_t));
#if VX_STREAM_PROCESSOR_SUPPORT
vx_hardware_caps_params_ext2_t paramExt2;
#endif
#if VX_FIXED_FUNCTION_DEVICE_SUPPORT
vx_hardware_caps_params_ext3_t paramExt3;
#endif
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
vx_hardware_caps_params_ext_t paramExt;
memset(&paramExt, 0, sizeof(vx_hardware_caps_params_ext_t));
status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt),
sizeof(vx_hardware_caps_params_ext_t));
param.evis1 = paramExt.base.evis1;
param.evis2 = paramExt.base.evis2;
#else
memset(&param, 0, sizeof(vx_hardware_caps_params_t));
status = vxQueryHardwareCaps(context->c, &param, sizeof(vx_hardware_caps_params_t));
#endif
TEST_CHECK_STATUS(status, final);
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
options->config.subGroupSize = paramExt.subGroupSize;
#ifdef VSI_40BIT_VA_SUPPORT
options->config.use_40bits_va = paramExt.supportVA40;
#endif
#if VX_STREAM_PROCESSOR_SUPPORT
memset(&paramExt2, 0, sizeof(vx_hardware_caps_params_ext2_t));
status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt2),
sizeof(vx_hardware_caps_params_ext2_t));
if (options->enable_stream_processor)
{
options->config.support_stream_processor = paramExt.supportStreamProcessor;
options->config.sp_exec_count = paramExt2.streamProcessorExecCount;
options->config.sp_vector_depth = paramExt2.streamProcessorVectorSize;
if (options->config.sp_exec_count > 0)
{
options->config.sp_per_core_vector_depth =
options->config.sp_vector_depth / options->config.sp_exec_count;
}
}
#endif
#if VX_FIXED_FUNCTION_DEVICE_SUPPORT
memset(&paramExt3, 0, sizeof(vx_hardware_caps_params_ext3_t));
status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt3),
sizeof(vx_hardware_caps_params_ext3_t));
options->config.support_ffd = paramExt3.supportFixedFunctionDevice;
#endif
#endif
if(param.evis1 == TRUE && param.evis2 == FALSE)
{
options->config.evis.ver = VSI_NN_HW_EVIS_1;
}
else if(param.evis1 == FALSE && param.evis2 == TRUE)
{
options->config.evis.ver = VSI_NN_HW_EVIS_2;
}
else
{
options->config.evis.ver = VSI_NN_HW_EVIS_NONE;
VSILOGW("Unsupported evis version");
}
final:
return status;
}
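A hedged sketch of how this new runtime query gets exercised: if vsi_nn_SetRunTimeVariable() accepts the stream-processor key as shown in the graph hunk later in this diff, toggling it re-queries the hardware caps into the per-graph options rather than the shared context. The public signature and the key string below are assumptions, not confirmed by this change.

/* Assumed entry point and key string; see the vsi_nn_SetRunTimeVariable hunk below. */
vsi_status s = vsi_nn_SetRunTimeVariable(graph, "VSI_VX_ENABLE_STREAM_PROCESSOR", "0");
if (s == VSI_SUCCESS)
{
    /* per-graph options refreshed by query_hardware_caps_runtime(); ctx->config untouched */
}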
#if (defined(__ANDROID__)) && ((ANDROID_SDK_VERSION >= 30) || (__ANDROID_API__ >= 30))
static const char* ENV_ENABLE_SHADER = "vendor.VIV_VX_ENABLE_SHADER";
static const char* ENV_ENABLE_OPCHECK = "vendor.VSI_NN_ENABLE_OPCHECK";
@@ -153,6 +242,44 @@ vsi_status vsi_nn_initOptions
    return VSI_SUCCESS;
}
vsi_status vsi_nn_initOptions_runtime
(
vsi_nn_runtime_option_t *options,
vsi_nn_context_t ctx
)
{
int32_t default_value = 1;
options->enable_shader = vsi_nn_getenv_asint(ENV_ENABLE_SHADER, 1);
options->enable_opcheck = vsi_nn_getenv_asint(ENV_ENABLE_OPCHECK, 1);
#if (VX_CONCAT_OPT_SUPPORT)
default_value = 0;
#else
default_value = 1;
#endif
options->enable_concat_optimize = vsi_nn_getenv_asint(ENV_ENABLE_CONCAT_OPTIMIZE, default_value);
options->enable_i8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1);
options->enable_dataconvert_optimize = vsi_nn_getenv_asint(ENV_ENABLE_DATACONVERT_OPTIMIZE, 1);
options->enable_stream_processor = vsi_nn_getenv_asint(ENV_ENABLE_STREAM_PROCESSOR, 1);
options->enable_rgb88_planar_nhwc = vsi_nn_getenv_asint(ENV_FORCE_RGB888_OUT_NHWC, 0);
#if (VX_STRIDED_SLICE_OPT_SUPPORT)
default_value = 0;
#else
default_value = 1;
#endif
options->enable_slice_optimize = vsi_nn_getenv_asint(ENV_ENABLE_SLICE_OPTIMIZE, default_value);
options->enable_batch_opt = vsi_nn_getenv_asint(ENV_ENABLE_BATCH_OPT, 0);
options->enable_save_file_type = vsi_nn_getenv_asint(ENV_SAVE_FILE_TYPE, 0);
options->enable_use_image_process = vsi_nn_getenv_asint(VSI_USE_IMAGE_PROCESS, -1);
options->enable_use_from_handle = vsi_nn_getenv_asint(VSI_USE_FROM_HANDLE, -1);
/*init hw params*/
options->config = ctx->config;
return VSI_SUCCESS;
}
vsi_nn_context_t vsi_nn_CreateContext
    ( void )
{
@@ -1362,7 +1362,7 @@ vsi_nn_graph_t * vsi_nn_CreateGraph
        graph->isAllowFastMode = TRUE;
        vsi_nn_MapInit( graph->node_table );
        vsi_nn_MapInit( graph->tensor_table );
-       vsi_nn_initOptions( ((vsi_nn_graph_prv_t*) graph)->options );
+       vsi_nn_initOptions_runtime( ((vsi_nn_graph_prv_t*) graph)->options, ctx );
    }
    else
    {
@@ -3398,6 +3398,7 @@ char* vsi_nn_GetRunTimeVariable
#define varSize 256
    char* value_str = (char*)malloc(sizeof(char) * varSize);
    CHECK_PTR_FAIL_GOTO(value_str, "Create value_str fail.", final);
+   CHECK_PTR_FAIL_GOTO(graph, "Graph is NULL!", final);
    memset(value_str, 0, varSize);
    char tmp_value[varSize] = {0};
    VSI_UNREFERENCED(tmp_value);
@@ -3502,6 +3503,8 @@ vsi_status vsi_nn_SetRunTimeVariable
            break;
        case VSI_VX_ENABLE_STREAM_PROCESSOR:
            options->enable_stream_processor = atoi(value);
+           options->config.support_stream_processor = atoi(value);
+           status = query_hardware_caps_runtime(graph->ctx, options);
            break;
        case VSI_VX_ENABLE_BATCH_OPT:
            options->enable_batch_opt = atoi(value);
@@ -895,10 +895,13 @@ static void _convert_const_I8toU8
    attr->dtype.vx_type = VSI_NN_TYPE_UINT8;
    attr->dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
    attr->dtype.zero_point += 128;
    if (tensor->t) vxReleaseTensor(&tensor->t);
    tensor->t = vsi_nn_CreateRawTensorFromData(graph, data, attr);
+#if defined(VSI_TENSOR_SPARSITY_SUPPORT)
+   int32_t is_sparsity = 0;
+   is_sparsity = vsi_nn_GetTensorIsSparsity(tensor);
+   vsi_nn_SetTensorIsSparsity(tensor, is_sparsity);
+#endif
final:
    vsi_nn_safe_free( data );
}/* _convert_const_I8toU8() */
@@ -247,7 +247,8 @@ static void _set_preproc_node_input_attr
    vsi_nn_tensor_attr_t* attr,
    vsi_nn_preprocess_image_size_t* input_size,
    vsi_nn_preprocess_source_format_e* source_format,
-   vsi_nn_preprocess_source_layout_e* source_layout
+   vsi_nn_preprocess_source_layout_e* source_layout,
+   vsi_nn_preprocess_dtype_convert_t* data_convert
    )
{
    *input_attr = *attr;
@@ -265,10 +266,17 @@ static void _set_preproc_node_input_attr
        }
    }
    if(*source_format == VSI_NN_SOURCE_FORMAT_TENSOR)
+   {
+       if(data_convert != NULL)
+       {
+           input_attr->dtype = data_convert->dtype;
+       }
+       else
        {
            input_attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
            input_attr->dtype.vx_type = VSI_NN_TYPE_FLOAT32;
        }
+   }
    else
    {
        input_attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
@@ -276,16 +284,16 @@ static void _set_preproc_node_input_attr
    }
    if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB)
    {
-       if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC)
-       {
-           input_attr->size[0] = input_attr->size[1]*input_attr->size[0];
-           input_attr->size[1] = input_attr->size[2];
-           input_attr->size[2] = 1;
-       }
-       else
-       {
-           input_attr->size[0] = input_attr->size[2]*input_attr->size[0];
-           input_attr->size[2] = 1;
-       }
+       if (*source_layout == VSI_NN_SOURCE_LAYOUT_NCHW)
+       {
+           vsi_size_t channel = input_attr->size[2];
+           if (channel != 3)
+           {
+               VSILOGE("RGB chanel must be 3, please have a check!");
+           }
+           input_attr->size[2] = input_attr->size[1];
+           input_attr->size[1] = input_attr->size[0];
+           input_attr->size[0] = channel;
+       }
    }
@@ -333,15 +341,10 @@ static void _set_preproc_node_input_attr
static void _set_preproc_node_output_attr
    (
    vsi_nn_tensor_attr_t* output_attr,
-   vsi_nn_tensor_attr_t* attr,
-   vsi_nn_preprocess_dtype_convert_t* data_convert
+   vsi_nn_tensor_attr_t* attr
    )
{
    *output_attr = *attr;
-   if(data_convert != NULL)
-   {
-       output_attr->dtype = data_convert->dtype;
-   }
    output_attr->dtype.fmt = VSI_NN_DIM_FMT_NCHW;
    output_attr->dim_num = VSI_NN_DIM_AUTO;
    output_attr->is_const = FALSE;
@@ -603,10 +606,11 @@ vsi_status vsi_nn_add_single_preproc_node
    _set_preproc_node_out_attr(node, image_resize, &org_norm_tensor->attr, source_layout);
    /* Set input tensor attr */
-   _set_preproc_node_input_attr(&input_attr, &org_norm_tensor->attr, input_size, source_format, source_layout);
+   _set_preproc_node_input_attr(&input_attr, &org_norm_tensor->attr, input_size,
+       source_format, source_layout, data_convert);
    /* Set output tensor attr */
-   _set_preproc_node_output_attr(&output_attr, &org_norm_tensor->attr, data_convert);
+   _set_preproc_node_output_attr(&output_attr, &org_norm_tensor->attr);
    /* Create new norm and virtual tensors */
    if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 ||
@@ -33,6 +33,7 @@
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_util.h"
#include "vsi_nn_rnn_helper.h"
+#include "vsi_nn_types_prv.h"
#include "vsi_nn_error.h"
vsi_bool vsi_nn_rnn_find_best_kernel_size
@@ -804,7 +805,7 @@ vsi_status vsi_nn_rnn_data_check_aligned
        vsi_size_t tensor_size = vsi_nn_GetTensorSize( input[i]->attr.size,
            input[i]->attr.dim_num, input[i]->attr.dtype.vx_type );
-       if( ofst & 0x3f && !self->graph->ctx->config.support_stream_processor)
+       if( ofst & 0x3f && !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor)
        {
            vsi_nn_internal_init_tensor_attr(&attr, &input[i]->attr.dtype, use_virtual_tensor);
            output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
@@ -155,6 +155,15 @@ static void print_tensor
                tensor->attr.dtype.group_size);
            ext_attr[count] = 0;
            break;
+       case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC:
+           count = snprintf(&ext_attr[0],
+               _EXT_ATTR_BUF_SZ,
+               "ASYM GPTQ axis=%d, count=%d, group_size=%d",
+               tensor->attr.dtype.group_channel_dim,
+               tensor->attr.dtype.group_count,
+               tensor->attr.dtype.group_size);
+           ext_attr[count] = 0;
+           break;
#endif
        default:
            vsi_nn_strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ);
@@ -449,6 +458,11 @@ static vsi_bool _init_tensor
            scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.group_count);
            CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final );
            memcpy(scales, tensor->attr.dtype.group_scales, tensor->attr.dtype.group_count * sizeof(float));
+           zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim);
+           CHECK_PTR_FAIL_GOTO( zeroPoints, "Create buffer fail.", final );
+           memcpy(zeroPoints,
+               tensor->attr.dtype.zero_points,
+               tensor->attr.dtype.zero_points_dim * sizeof(int32_t));
            params.quant_data.affinePerGroup.channel_dim = tensor->attr.dtype.group_channel_dim;
            params.quant_data.affinePerGroup.group_size = tensor->attr.dtype.group_size;
            params.quant_data.affinePerGroup.scale_group_count = tensor->attr.dtype.group_count;
@@ -460,6 +474,32 @@ static vsi_bool _init_tensor
            VSILOGE(
                "can't support qnt_type "
                "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC.");
break;
#endif
case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC:
#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_GROUP;
// This is a hack that driver doesn't support const scales
scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.group_count);
CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final );
memcpy(scales, tensor->attr.dtype.group_scales, tensor->attr.dtype.group_count * sizeof(float));
zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim);
CHECK_PTR_FAIL_GOTO( zeroPoints, "Create buffer fail.", final );
memcpy(zeroPoints,
tensor->attr.dtype.group_zero_points,
tensor->attr.dtype.group_count * sizeof(int32_t));
params.quant_data.affinePerGroup.channel_dim = tensor->attr.dtype.group_channel_dim;
params.quant_data.affinePerGroup.group_size = tensor->attr.dtype.group_size;
params.quant_data.affinePerGroup.scale_group_count = tensor->attr.dtype.group_count;
params.quant_data.affinePerGroup.scales = scales;
params.quant_data.affinePerGroup.zero_points = zeroPoints;
params.quant_data.affinePerGroup.zero_point_group_count = tensor->attr.dtype.group_count;
break;
#else
VSILOGE(
"can't support qnt_type "
"VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC.");
break;
#endif
        default:
            break;
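For readers unfamiliar with the per-group asymmetric (GPTQ-style) scheme being wired up here, the dequantization rule it implies is sketched below. group_size, group_scales and group_zero_points mirror the dtype fields used above; treating the grouped channel dimension as a flat index is a simplifying assumption, since the driver handles the real tensor layout.

/* Sketch of per-group affine dequantization (assumption: 1-D view along the
 * grouped channel dimension; the driver's actual layout handling is richer). */
static float dequant_per_group(int8_t q, size_t channel,
                               const float *group_scales,
                               const int32_t *group_zero_points,
                               int32_t group_size)
{
    size_t g = channel / (size_t)group_size; /* which group this element belongs to */
    return ((float)q - (float)group_zero_points[g]) * group_scales[g];
}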
@@ -1788,6 +1828,57 @@ int8_t vsi_nn_GetTensorIsScalar
    return _get_tensor_is_scalar((vsi_nn_tensor_prv_t*)tensor);
}
int32_t _get_tensor_is_sparsity
(
vsi_nn_tensor_prv_t* tensor
)
{
int32_t is_sparsity = FALSE;
if (NULL == tensor)
{
VSILOGE("To get is_sparsity, tensor pointer SHOULD NOT be NULL.");
goto final;
}
#if defined(VSI_TENSOR_SPARSITY_SUPPORT)
is_sparsity = tensor->sparsity_type;
#endif
final:
return is_sparsity;
}
int32_t vsi_nn_GetTensorIsSparsity
(
vsi_nn_tensor_t* tensor
)
{
return _get_tensor_is_sparsity((vsi_nn_tensor_prv_t*)tensor);
}
vsi_status vsi_nn_SetTensorIsSparsity
(
vsi_nn_tensor_t* tensor,
int32_t is_sparsity
)
{
VSI_UNREFERENCED(is_sparsity);
vsi_status status = VSI_SUCCESS;
if (NULL == tensor) {
status = VSI_FAILURE;
goto final;
}
#if defined(VSI_TENSOR_SPARSITY_SUPPORT)
vxSetTensorAttribute(tensor->t,
VX_TENSOR_SPARSITY_TYPE,
&is_sparsity,
sizeof(vx_enum));
status = VSI_SUCCESS;
((vsi_nn_tensor_prv_t*)tensor)->sparsity_type = is_sparsity;
#endif
final:
return status;
}
vsi_status vsi_nn_CopyRawDataToTensor
    (
    vsi_nn_graph_t* graph,
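A small usage sketch for the new sparsity accessors added above; the non-zero pattern value is illustrative, since the valid vx_tensor_sparsity_param_e values come from the driver headers and are not part of this diff.

/* Illustrative only; meaningful in builds with VSI_TENSOR_SPARSITY_SUPPORT. */
int32_t pattern = vsi_nn_GetTensorIsSparsity(weight_tensor); /* 0 when never set */
if (pattern != 0)
{
    /* re-apply after the tensor handle is recreated, as _convert_const_I8toU8 does above */
    vsi_nn_SetTensorIsSparsity(weight_tensor, pattern);
}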
@@ -75,6 +75,11 @@ vsi_status _set_tensor_is_scalar
    int8_t is_salar
    );
+vsi_status _set_tensor_is_sparsity(
+   vsi_nn_tensor_prv_t* tensor,
+   int32_t is_sparsity
+   );
int8_t _get_tensor_is_from_axisram
    (
    vsi_nn_tensor_prv_t* tensor
@@ -127,6 +132,11 @@ vsi_nn_tensor_t * vsi_nn_kernel_insert_reshape_node
    vsi_nn_opt_direction_e direction
    );
+uint32_t vsi_nn_get_tensor_dims
+   (
+   vsi_nn_tensor_t* tensor
+   );
#ifdef __cplusplus
}
#endif
@@ -108,6 +108,11 @@ typedef struct _vsi_nn_tensor_prv
    /** create tensor from axisram.*/
    int8_t is_from_axisram;
+   /** 2:4 sparsity attr. */
+#if defined(VSI_TENSOR_SPARSITY_SUPPORT)
+   vx_tensor_sparsity_param_e sparsity_type; /*!< \brief sparsity type for the tensor */
+#endif
    // Add tensor internal attribute here...
} vsi_nn_tensor_prv_t;