Update internal ovxlib to release/1.2.22 (#706)

* Update internal ovxlib to release/1.2.22
* Refine yaml file for blocking tfhub model tests

Signed-off-by: Feiyue.Chen <Feiyue.Chen@verisilicon.com>

Parent: 149834832c
Commit: 8494275d76
@@ -124,7 +124,7 @@ jobs:
       run: |
         git config --global user.email "xiang.zhang@verisilicon.com"
         git config --global user.name "xiang.zhang"
-        git clone https://github.com/tensorflow/tensorflow.git ${{github.workspace}}/3rd-party/tensorflow && cd ${{github.workspace}}/3rd-party/tensorflow/ && git checkout v2.10.0
+        git clone https://github.com/tensorflow/tensorflow.git ${{github.workspace}}/3rd-party/tensorflow && cd ${{github.workspace}}/3rd-party/tensorflow/ && git checkout v2.16.1
         git clone https://github.com/VeriSilicon/tflite-vx-delegate.git ${{github.workspace}}/vx-delegate
         cmake -B ${{github.workspace}}/vx-delegate/build -S ${{github.workspace}}/vx-delegate -DFETCHCONTENT_SOURCE_DIR_TENSORFLOW=${{github.workspace}}/3rd-party/tensorflow -DTIM_VX_INSTALL=${{github.workspace}}/tim-vx.install.dir/ -DTFLITE_ENABLE_NNAPI=OFF -DTFLITE_ENABLE_XNNPACK=OFF
         cmake --build ${{github.workspace}}/vx-delegate/build --config ${{env.BUILD_TYPE}}

@@ -283,61 +283,61 @@ jobs:
   #       chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
   #       ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/tfhub.movenet.multipose.tflite

-  tfhub-efficientdet-lite0:
-    runs-on: ubuntu-latest
-    needs: [vx-delegate-build, tim-vx-unit-test]
-    steps:
-      - name: download test binary
-        uses: actions/download-artifact@v3
-      - name: download model
-        run: |
-          wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite0/detection/metadata/1.tflite
-      - name: benchmark-model
-        run: |
-          chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
-          ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
+  # tfhub-efficientdet-lite0:
+  #   runs-on: ubuntu-latest
+  #   needs: [vx-delegate-build, tim-vx-unit-test]
+  #   steps:
+  #     - name: download test binary
+  #       uses: actions/download-artifact@v3
+  #     - name: download model
+  #       run: |
+  #         wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite0/detection/metadata/1.tflite
+  #     - name: benchmark-model
+  #       run: |
+  #         chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
+  #         ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite

-  tfhub-efficientdet-lite1:
-    runs-on: ubuntu-latest
-    needs: [vx-delegate-build, tim-vx-unit-test]
-    steps:
-      - name: download test binary
-        uses: actions/download-artifact@v3
-      - name: download model
-        run: |
-          wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite1/detection/metadata/1.tflite
-      - name: benchmark-model
-        run: |
-          chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
-          ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
+  # tfhub-efficientdet-lite1:
+  #   runs-on: ubuntu-latest
+  #   needs: [vx-delegate-build, tim-vx-unit-test]
+  #   steps:
+  #     - name: download test binary
+  #       uses: actions/download-artifact@v3
+  #     - name: download model
+  #       run: |
+  #         wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite1/detection/metadata/1.tflite
+  #     - name: benchmark-model
+  #       run: |
+  #         chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
+  #         ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite

-  tfhub-efficientdet-lite2:
-    runs-on: ubuntu-latest
-    needs: [vx-delegate-build, tim-vx-unit-test]
-    steps:
-      - name: download test binary
-        uses: actions/download-artifact@v3
-      - name: download model
-        run: |
-          wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
-      - name: benchmark-model
-        run: |
-          chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
-          ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
+  # tfhub-efficientdet-lite2:
+  #   runs-on: ubuntu-latest
+  #   needs: [vx-delegate-build, tim-vx-unit-test]
+  #   steps:
+  #     - name: download test binary
+  #       uses: actions/download-artifact@v3
+  #     - name: download model
+  #       run: |
+  #         wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
+  #     - name: benchmark-model
+  #       run: |
+  #         chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
+  #         ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite

-  tfhub-efficientdet-lite3:
-    runs-on: ubuntu-latest
-    needs: [vx-delegate-build, tim-vx-unit-test]
-    steps:
-      - name: download test binary
-        uses: actions/download-artifact@v3
-      - name: download model
-        run: |
-          wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
-      - name: benchmark-model
-        run: |
-          chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
-          ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
+  # tfhub-efficientdet-lite3:
+  #   runs-on: ubuntu-latest
+  #   needs: [vx-delegate-build, tim-vx-unit-test]
+  #   steps:
+  #     - name: download test binary
+  #       uses: actions/download-artifact@v3
+  #     - name: download model
+  #       run: |
+  #         wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
+  #     - name: benchmark-model
+  #       run: |
+  #         chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
+  #         ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite

   # acuity-yolov3-608-quant:
   #   runs-on: ubuntu-latest
@@ -9,3 +9,4 @@ DEF_NODE_TYPE(custom_sample)
 DEF_NODE_TYPE(custom_tiny_yolov4_postprocess)
 DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_confidence)
 DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_box)
+DEF_NODE_TYPE(custom_letterbox)

@@ -9,3 +9,4 @@ DEF_OP(CUSTOM_SAMPLE)
 DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS)
 DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE)
 DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX)
+DEF_OP(CUSTOM_LETTERBOX)
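The two lists above are X-macro tables: registering the new op only takes one `DEF_NODE_TYPE`/`DEF_OP` entry, and every consumer that defines the macro before including the list picks it up automatically. A minimal standalone sketch of the pattern follows; it is our illustration of the idiom, not ovxlib's actual expansion sites.

#include <stdio.h>

/* X-macro op table, mirroring the DEF_OP(...) list above (illustrative). */
#define OP_LIST \
    DEF_OP(CUSTOM_SAMPLE) \
    DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS) \
    DEF_OP(CUSTOM_LETTERBOX)

/* Expansion 1: build an enum from the table. */
#define DEF_OP(name) OP_##name,
typedef enum { OP_LIST OP_COUNT } op_kind_e;
#undef DEF_OP

/* Expansion 2: build a parallel name table from the same list. */
#define DEF_OP(name) #name,
static const char *op_names[] = { OP_LIST };
#undef DEF_OP

int main(void)
{
    printf("%d ops, last: %s\n", (int)OP_COUNT, op_names[OP_CUSTOM_LETTERBOX]);
    return 0;
}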
@@ -0,0 +1,61 @@
+/****************************************************************************
+*
+*    Copyright (c) 2020 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef _VSI_NN_OP_CUSTOM_LETTERBOX_H
+#define _VSI_NN_OP_CUSTOM_LETTERBOX_H
+
+#include "vsi_nn_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _vsi_nn_custom_letterbox_param
+{
+    struct _custom_letterbox_local_data_t* local;
+    int32_t new_shape_w;
+    int32_t new_shape_h;
+    vx_bool auto_bool;
+    vx_bool scaleFill;
+    vx_bool scaleup;
+    int32_t stride;
+    vx_bool center;
+    float mean_r;
+    float mean_g;
+    float mean_b;
+    float scale_r;
+    float scale_g;
+    float scale_b;
+    int32_t pad_value_r;
+    int32_t pad_value_g;
+    int32_t pad_value_b;
+    vx_bool reverse_channel;
+} vsi_nn_custom_letterbox_param;
+_compiler_assert(offsetof(vsi_nn_custom_letterbox_param, local) == 0, \
+    vsi_nn_custom_lertterbox_h );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
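Note how the param struct puts `local` first and pins that layout with `_compiler_assert(offsetof(...) == 0, ...)`, so generic code can treat the head of any op's param struct as the local-data pointer. A standalone sketch of the same compile-time check, using standard C11 `_Static_assert` in place of ovxlib's `_compiler_assert` macro (an assumption about that macro's role):

#include <stddef.h>
#include <stdint.h>

typedef struct letterbox_param
{
    struct local_data_t *local;   /* must stay the first member */
    int32_t new_shape_w;
    int32_t new_shape_h;
} letterbox_param;

/* Compile-time guarantee: fails to build if someone reorders the members,
 * which would break code that reads the local pointer at offset 0. */
_Static_assert(offsetof(letterbox_param, local) == 0,
               "local must be the first field of letterbox_param");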
@@ -34,5 +34,6 @@
 #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h"
 #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h"
 #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h"
+#include "custom/ops/vsi_nn_op_custom_letterbox.h"

 #endif
@@ -203,3 +203,4 @@ DEF_OP(BITCAST)
 DEF_OP(GROUPED_CONV3D)
 DEF_OP(COL2IM)
 DEF_OP(L1_LAYER_NORM)
+DEF_OP(ROPE)
@@ -80,7 +80,7 @@ typedef struct _vsi_nn_pre_process_rgb_param
     float g_scale;
     float b_scale;
     /* pre process rgb layer local data structure */
-    vsi_nn_pre_process_rgb_lcl_data local;
+    vsi_nn_pre_process_rgb_lcl_data *local;
 } vsi_nn_pre_process_rgb_param;

 #ifdef __cplusplus
@@ -0,0 +1,49 @@
+/****************************************************************************
+*
+*    Copyright (c) 2020 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef _VSI_NN_OP_ROPE_H
+#define _VSI_NN_OP_ROPE_H
+
+#include "vsi_nn_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _vsi_nn_rope_param
+{
+    struct _rope_local_data_t* local;
+    // Add parameters here
+    int32_t axis;
+    vsi_bool interleaved;
+} vsi_nn_rope_param;
+_compiler_assert(offsetof(vsi_nn_rope_param, local) == 0, \
+    vsi_nn_rope_h );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
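The new `vsi_nn_rope_param` carries just an axis and an `interleaved` flag. As a reference for what a rotary-position-embedding (RoPE) op typically computes, here is a minimal float sketch; the 10000 frequency base and the two pairing rules are generic RoPE conventions assumed by us, not taken from this diff.

#include <math.h>
#include <stdio.h>

/* Rotate pairs of features by position-dependent angles (RoPE).
 * interleaved != 0 pairs (x[2i], x[2i+1]); otherwise (x[i], x[i+d/2]).
 * Generic reference, not ovxlib's kernel. */
static void rope_apply(float *x, int dim, int pos, int interleaved)
{
    for (int i = 0; i < dim / 2; i++)
    {
        float theta = pos * powf(10000.0f, -2.0f * i / dim);
        float c = cosf(theta), s = sinf(theta);
        int a = interleaved ? 2 * i : i;
        int b = interleaved ? 2 * i + 1 : i + dim / 2;
        float xa = x[a], xb = x[b];
        x[a] = xa * c - xb * s;
        x[b] = xa * s + xb * c;
    }
}

int main(void)
{
    float v[4] = { 1.0f, 0.0f, 1.0f, 0.0f };
    rope_apply(v, 4, 3, 1);          /* position 3, interleaved pairing */
    printf("%f %f %f %f\n", v[0], v[1], v[2], v[3]);
    return 0;
}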
@@ -34,6 +34,7 @@ typedef struct _vsi_nn_topk_param
 {
     uint32_t k;
     int32_t axis;
+    struct _topk_local_data_t* local;
 } vsi_nn_topk_param;

 #ifdef __cplusplus
@@ -384,25 +384,17 @@ static VSI_INLINE_API float fp16_to_fp32

 static VSI_INLINE_API float bfp16_to_fp32
     (
-    int16_t in
+    uint16_t in
     )
 {
-    uint32_t t1, t2, t3;
     float out;
     fp32_bit_cast_t fp32_bit_cast;

-    t1 = in & 0x00FF; // Mantissa
-    t2 = in & 0xFF00; // Sign bit + Exponent
-    t3 = in & 0x7F00; // Exponent
-
-    t1 <<= 16;
-    t2 <<= 16; // Shift (sign + Exponent) bit into position
-    t1 |= t2; // Re-insert (sign + Exponent) bit
-
-    fp32_bit_cast.data = t1;
+    fp32_bit_cast.data = (uint32_t)(in << 16);
     out = fp32_bit_cast.val;

-    return t3 == 0 ? 0.0f : out;
+    return out;
 } /* bfp16_to_fp32() */

 static VSI_INLINE_API uint16_t fp32_to_fp16

@@ -720,7 +712,7 @@ static VSI_INLINE_API vsi_status dtype_to_float32
         *dst = fp16_to_fp32( *(int16_t *)src );
         break;
     case VSI_NN_TYPE_BFLOAT16:
-        *dst = bfp16_to_fp32( *(int16_t *)src );
+        *dst = bfp16_to_fp32( *(uint16_t *)src );
         break;
     case VSI_NN_TYPE_FLOAT8_E4M3:
         *dst = fp8_e4m3_to_fp32(*(int8_t*)src, src_dtype->scale);
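This simplification works because bfloat16 is exactly the top 16 bits of an IEEE-754 float32: one shift into the high half and a bit-cast suffices, so the old mask-and-reassemble code (and its flush-to-zero of zero-exponent values) goes away, and taking the input as `uint16_t` avoids sign-extension surprises. A standalone round-trip sketch of the same idea:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* bfloat16 <-> float32: bf16 is the high 16 bits of a float32, so
 * conversion is a shift plus a bit-cast (memcpy sidesteps strict
 * aliasing). Narrowing uses round-to-nearest-even; NaN payloads are
 * not specially handled in this sketch. */
static float bf16_to_f32(uint16_t in)
{
    uint32_t bits = (uint32_t)in << 16;
    float out;
    memcpy(&out, &bits, sizeof out);
    return out;
}

static uint16_t f32_to_bf16(float in)
{
    uint32_t bits;
    memcpy(&bits, &in, sizeof bits);
    /* round to nearest even before truncating the low 16 bits */
    uint32_t rounded = bits + 0x7FFF + ((bits >> 16) & 1);
    return (uint16_t)(rounded >> 16);
}

int main(void)
{
    float x = -1.625f;
    printf("%f -> 0x%04x -> %f\n", x, f32_to_bf16(x), bf16_to_f32(f32_to_bf16(x)));
    return 0;
}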
(File diff suppressed because it is too large.)
@@ -61,14 +61,13 @@ typedef struct _vsi_nn_hw_config_t
 {
     char target_name[VSI_NN_MAX_TARGET_NAME];
     vsi_nn_hw_evis_t evis;
-#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
     uint32_t subGroupSize;
-#endif
     uint32_t use_40bits_va;
     uint32_t support_stream_processor;
     uint32_t sp_exec_count;
     uint32_t sp_vector_depth;
     uint32_t sp_per_core_vector_depth;
+    uint32_t support_ffd;
 } vsi_nn_hw_config_t;

 typedef struct _vsi_nn_runtime_option_t
@@ -89,6 +88,7 @@ typedef struct _vsi_nn_runtime_option_t
     int32_t enable_save_file_type;
     int32_t enable_use_image_process;
     int32_t enable_use_from_handle;
+    vsi_nn_hw_config_t config;
 } vsi_nn_runtime_option_t;

 /**
@@ -101,6 +101,15 @@ typedef struct _vsi_nn_context_t
     vsi_nn_runtime_option_t options;
 } VSI_PUBLIC_TYPE *vsi_nn_context_t;

+/**
+ * Query and set options->config hw params.
+ */
+OVXLIB_API vsi_status query_hardware_caps_runtime
+    (
+    vsi_nn_context_t ctx,
+    vsi_nn_runtime_option_t *options
+    );
+
 /**
  * Create context
  * Create ovxlib NN runtime context.
@@ -113,6 +122,11 @@ OVXLIB_API vsi_status vsi_nn_initOptions
     (
     vsi_nn_runtime_option_t *options
     );
+OVXLIB_API vsi_status vsi_nn_initOptions_runtime
+    (
+    vsi_nn_runtime_option_t *options,
+    vsi_nn_context_t ctx
+    );
 /**
  * Release context
  * Release ovxlib NN runtime resource and reset context handle to NULL.
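A hypothetical call pattern for the two runtime-option entry points declared above; the signatures come from those declarations, but the header name and call order are assumptions not shown in this diff.

#include "vsi_nn_pub.h"   /* assumed public header */

vsi_status setup_runtime_options(vsi_nn_context_t ctx)
{
    vsi_nn_runtime_option_t options;

    /* Seed defaults for this context, then pull hardware caps into
     * options.config (the field added in this commit). */
    vsi_status status = vsi_nn_initOptions_runtime(&options, ctx);
    if (status == VSI_SUCCESS)
    {
        status = query_hardware_caps_runtime(ctx, &options);
    }
    return status;
}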
@@ -57,5 +57,8 @@
 #define VSI_PER_GROUP_QUANTIZATION_SUPPORT
 #endif
 #define VSI_GRAPH_RUNTIME_ENV_SUPPORT
+#if defined(VX_TENSOR_SPARSITY_SUPPORT)
+#define VSI_TENSOR_SPARSITY_SUPPORT
+#endif

 #endif
@@ -216,6 +216,7 @@
 #include "ops/vsi_nn_op_grouped_conv3d.h"
 #include "ops/vsi_nn_op_col2im.h"
 #include "ops/vsi_nn_op_l1_layer_norm.h"
+#include "ops/vsi_nn_op_rope.h"
 /* custom node head define define */
 #include "custom/vsi_nn_custom_node_type.h"
 #include "ops/vsi_nn_op_inverse_sigmoid.h"
@@ -420,6 +421,7 @@ typedef union _vsi_nn_nn_param
     vsi_nn_grouped_conv3d_param grouped_conv3d;
     vsi_nn_col2im_param col2im;
     vsi_nn_l1_layer_norm_param l1_layer_norm;
+    vsi_nn_rope_param rope;
     void* client_param;

     /* custom node data struct define */
@@ -86,8 +86,10 @@ typedef enum
     VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6,
     /** perchannel float8 */
     VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7,
-    /** GPQT */
+    /** pergroup symmetric */
     VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC = 0x8,
+    /** pergroup asymmetric */
+    VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC = 0x9,
     /** undefined type */
     VSI_NN_QNT_TYPE_NA = 0xff,
 } vsi_nn_qnt_type_e;
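The new enum value adds an asymmetric variant of per-group (GPTQ-style) quantization, where each group of weights along a channel carries its own scale and zero point. A standalone sketch of the dequantization this implies — our reading of the scheme, not ovxlib's kernel code:

#include <stdint.h>
#include <stdio.h>

/* Per-group asymmetric dequantization: element i uses the scale and
 * zero point of its group (i / group_size). Illustrative only. */
static void dequant_pergroup_asym(const uint8_t *q, float *out, int n,
                                  int group_size,
                                  const float *scales, const int32_t *zps)
{
    for (int i = 0; i < n; i++)
    {
        int g = i / group_size;
        out[i] = ((int32_t)q[i] - zps[g]) * scales[g];
    }
}

int main(void)
{
    uint8_t q[4] = { 0, 255, 0, 255 };
    float scales[2] = { 0.5f, 0.01f };
    int32_t zps[2] = { 128, 0 };
    float out[4];
    dequant_pergroup_asym(q, out, 4, 2, scales, zps);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}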
@@ -418,6 +418,34 @@ OVXLIB_API vsi_status vsi_nn_SetTensorIsScalar
     int8_t is_scalar
     );

+/**
+ * Get Tensor is_sparsity
+ * Get the is_sparsity of the tensor
+ *
+ * @param[in] tensor Tensor.
+ *
+ * @return is_sparsity flag of the tensor.
+ */
+OVXLIB_API int32_t vsi_nn_GetTensorIsSparsity
+    (
+    vsi_nn_tensor_t* tensor
+    );
+
+/**
+ * Set Weight Tensor whether is sparsity
+ * Set the is_sparsity for the tensor
+ *
+ * @param[in] tensor Tensor.
+ * @param[in] is_sparsity New is_sparsity value of the tensor.
+ *
+ * @return VSI_SUCCESS on success, or error code otherwise.
+ **/
+OVXLIB_API vsi_status vsi_nn_SetTensorIsSparsity(
+    vsi_nn_tensor_t* tensor,
+    int32_t is_sparsity
+    );
+
 OVXLIB_API vsi_status vsi_nn_CopyRawDataToTensor
     (
     vsi_nn_graph_t* graph,
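A hypothetical use of the sparsity getter/setter declared above; the signatures come from the declarations, the call pattern is our own.

/* Mark a weight tensor as sparse if it is not already flagged. */
void mark_sparse_weights(vsi_nn_tensor_t *weights)
{
    if (!vsi_nn_GetTensorIsSparsity(weights))
    {
        vsi_nn_SetTensorIsSparsity(weights, 1);
    }
}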
@@ -33,7 +33,7 @@ extern "C"{

 #define VSI_NN_VERSION_MAJOR 1
 #define VSI_NN_VERSION_MINOR 2
-#define VSI_NN_VERSION_PATCH 14
+#define VSI_NN_VERSION_PATCH 22
 #define VSI_NN_VERSION \
     (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
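With this bump the packed version for 1.2.22 becomes 1*10000 + 2*100 + 22 = 10222. A small sketch of the encode/decode arithmetic (decoding the patch as v % 100 holds while minor and patch stay below 100):

#include <stdio.h>

#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 2
#define VSI_NN_VERSION_PATCH 22
#define VSI_NN_VERSION \
    (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)

int main(void)
{
    int v = VSI_NN_VERSION;                    /* 10222 for 1.2.22 */
    printf("packed=%d -> %d.%d.%d\n", v, v / 10000, (v / 100) % 100, v % 100);
    return 0;
}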
@@ -0,0 +1,475 @@
+/****************************************************************************
+*
+*    Copyright (c) 2020 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "utils/vsi_nn_dtype_util.h"
+#include "utils/vsi_nn_dtype_util_prv.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */
+
+#define _CUSTOM_LETTERBOX_KERNEL_SOURCE "custom_letterbox"
+
+// Add kernel hashtable here
+#define CUSTOM_LETTERBOX_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
+        (( IN_DTYPE ) | ( OUT_DTYPE << 8 ))
+#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
+        { CUSTOM_LETTERBOX_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
+        CVIVANTE_NAMESPACE("evis.custom_letterbox_"#IN_DTYPE"to"#OUT_DTYPE), \
+        _CUSTOM_LETTERBOX_KERNEL_SOURCE }
+
+typedef struct
+{
+    uint32_t key;
+    char * function_name;
+    const char * source_name;
+} _kernel_map_type;
+
+static const _kernel_map_type _custom_letterbox_kernel_map[] =
+{
+    // Register kernel here
+    PACK_KERNEL_MAP( U8, U8 ),
+    PACK_KERNEL_MAP( U8, I8 ),
+    PACK_KERNEL_MAP( U8, F16 ),
+};
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _custom_letterbox_kernel_param_def[] =
+{
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+};
+
+#define _CUSTOM_LETTERBOX_PARAM_NUM  _cnt_of_array( _custom_letterbox_kernel_param_def )
+/*
+ * Kernel initializer
+ */
+DEF_KERNEL_INITIALIZER(_custom_letterbox_initializer)
+    (
+    vsi_nn_kernel_node_t node,
+    const vsi_nn_kernel_node_param_t * param,
+    size_t param_size
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    gpu_param_t gpu_param = {
+        2,
+        {0, 0, 0},
+        {0, 0, 0},
+        {0, 0, 0},
+        {0, 0, 0}
+        };
+
+    vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
+    VSI_UNREFERENCED(param_size);
+    int32_t top = 0;
+    int32_t bottom = 0;
+    int32_t left = 0;
+    int32_t right = 0;
+    float scale_w = 0;
+    float scale_h = 0;
+    int32_t resize_w = 0;
+    int32_t resize_h = 0;
+    int32_t resize_max_w = 0;
+    int32_t resize_max_h = 0;
+    float output_scale = 1.0f;
+    float output_zp = 0;
+    float out_scale_r = 0;
+    float out_zp_r = 0;
+    float out_scale_g = 0;
+    float out_zp_g = 0;
+    float out_scale_b = 0;
+    float out_zp_b = 0;
+    float pad_v_r = 0;
+    float pad_v_g = 0;
+    float pad_v_b = 0;
+    int32_t in_width = 0;
+    int32_t in_height = 0;
+    int32_t out_width = 0;
+    int32_t out_height = 0;
+    float mean_r = 0;
+    float mean_g = 0;
+    float mean_b = 0;
+    float scale_r = 0;
+    float scale_g = 0;
+    float scale_b = 0;
+    vx_int32 pad_value_r = 0;
+    vx_int32 pad_value_g = 0;
+    vx_int32 pad_value_b = 0;
+    vx_int32 r_order = 0;
+    vx_int32 b_order = 0;
+    vx_int32 reverse_channel = 0;
+
+    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
+    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
+
+    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &top);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &bottom);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &left);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &right);
+    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &mean_r);
+    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &mean_g);
+    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &mean_b);
+    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &scale_r);
+    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &scale_g);
+    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[11], &scale_b);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &pad_value_r);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &pad_value_g);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[14], &pad_value_b);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[15], &reverse_channel);
+    CHECK_STATUS_FAIL_GOTO(status, final );
+
+    in_width = (int32_t)attr[0]->shape->data[0] / 3;
+    in_height = (int32_t)attr[0]->shape->data[1];
+    out_width = (int32_t)attr[1]->shape->data[0];
+    out_height = (int32_t)attr[1]->shape->data[1] / 3;
+
+    output_scale = 1.0f / attr[1]->scale;
+    output_zp = (float)(attr[1]->zero_point);
+
+    resize_w = out_width - left - right;
+    resize_h = out_height - top - bottom;
+    resize_max_w = out_width - right;
+    resize_max_h = out_height - bottom;
+    scale_w = (float)in_width / resize_w;
+    scale_h = (float)in_height / resize_h;
+    out_scale_r = scale_r / output_scale;
+    out_zp_r = output_zp - out_scale_r * mean_r;
+    out_scale_g = scale_g / output_scale;
+    out_zp_g = output_zp - out_scale_g * mean_g;
+    out_scale_b = scale_b / output_scale;
+    out_zp_b = output_zp - out_scale_b * mean_b;
+    pad_v_r = pad_value_r * out_scale_r + out_zp_r;
+    pad_v_g = pad_value_g * out_scale_g + out_zp_g;
+    pad_v_b = pad_value_b * out_scale_b + out_zp_b;
+
+    if (reverse_channel)
+    {
+        r_order = out_height * 2;
+        b_order = 0;
+    }
+    else
+    {
+        r_order = 0;
+        b_order = out_height * 2;
+    }
+
+    {
+        gpu_dp_inst_t uniU8RightSubLeft_4x4 = {{
+            0x00090909, // TCfg
+            0x00000000, // ASelt
+            0x00140003, 0x00000025, // ABin
+            0x000a0a0a, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00010001, 0x00000000, 0x00010001, 0x00000000,
+            0x00010001, 0x00000000, 0x00000000, 0x00000000 // Constant
+        }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uniLeftToFloat32_4x4 = {{
+            0x00010101, // TCfg
+            0x00000000, // ASelt
+            0x00010000, 0x00000002, // ABin
+            0x00020202, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000000, 0x00000001, 0x00000000,
+            0x00000001, 0x00000000, 0x00000000, 0x00000000 // Constant
+        }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uniExtactHalf8_2x8 = {{
+            0x11111111, // TCfg
+            0x11110000, // ASelt
+            0x06040200, 0x06040200, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000100, // AccumType, ConstantType, and PostShift
+            0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
+            0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
+        }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uniExtract8Data_2x8 = {{
+            0x33333333, // TCfg
+            0x11110000, // ASelt
+            0x03020100, 0x03020100, // ABin
+            0x00000000, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00002400, // AccumType, ConstantType, and PostShift
+            0x00000000, 0x00000000, 0x00000000, 0x00000000,
+            0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+        }, GPU_DP_TYPE_16 };
+        status |= vsi_nn_kernel_gpu_add_param( node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4 );
+        status |= vsi_nn_kernel_gpu_add_param( node, "uniLeftToFloat32_4x4", &uniLeftToFloat32_4x4 );
+        status |= vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8 );
+        status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Data_2x8", &uniExtract8Data_2x8 );
+    }
+    status |= vsi_nn_kernel_gpu_add_param( node, "top", &top );
+    status |= vsi_nn_kernel_gpu_add_param( node, "left", &left );
+    status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_r", &out_scale_r );
+    status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_g", &out_scale_g );
+    status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_b", &out_scale_b );
+    status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_r", &out_zp_r );
+    status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_g", &out_zp_g );
+    status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_b", &out_zp_b );
+    status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_r", &pad_v_r );
+    status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_g", &pad_v_g );
+    status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_b", &pad_v_b );
+    status |= vsi_nn_kernel_gpu_add_param( node, "scale_w", &scale_w );
+    status |= vsi_nn_kernel_gpu_add_param( node, "scale_h", &scale_h );
+    status |= vsi_nn_kernel_gpu_add_param( node, "resize_max_w", &resize_max_w );
+    status |= vsi_nn_kernel_gpu_add_param( node, "resize_max_h", &resize_max_h );
+    status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height );
+    status |= vsi_nn_kernel_gpu_add_param( node, "r_order", &r_order );
+    status |= vsi_nn_kernel_gpu_add_param( node, "b_order", &b_order );
+
+    gpu_param.global_scale[0] = 1;
+    gpu_param.global_scale[1] = 1;
+    gpu_param.global_size[0] = out_width;
+    gpu_param.global_size[1] = out_height;
+
+    status |= vsi_nn_kernel_gpu_config( node, &gpu_param );
+    CHECK_STATUS_FAIL_GOTO(status, final );
+
+final:
+    if (attr[0])
+    {
+        vsi_nn_kernel_tensor_attr_release( &attr[0] );
+        attr[0] = NULL;
+    }
+    if (attr[1])
+    {
+        vsi_nn_kernel_tensor_attr_release( &attr[1] );
+        attr[1] = NULL;
+    }
+
+    return status;
+} /* _custom_letterbox_initializer() */
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+    (
+    vsi_nn_kernel_t * kernel,
+    vsi_nn_tensor_t * const * const inputs,
+    vsi_nn_tensor_t * const * const outputs
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_dtype_e in_dtype;
+    vsi_nn_kernel_dtype_e out_dtype;
+    const _kernel_map_type * kernel_map = _custom_letterbox_kernel_map;
+    size_t kernel_map_size = _cnt_of_array( _custom_letterbox_kernel_map );
+    vx_param_description_t * param_def = _custom_letterbox_kernel_param_def;
+    size_t param_def_size = _cnt_of_array( _custom_letterbox_kernel_param_def );
+    vx_kernel_initialize_f initializer = _custom_letterbox_initializer;
+    uint32_t key = 0;
+    uint32_t i = 0;
+
+    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
+    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+
+    key = CUSTOM_LETTERBOX_HASH_KEY( in_dtype, out_dtype );
+
+    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
+    {
+        if ( kernel_map[i].key == key )
+        {
+            break;
+        }
+    }
+    if ( i < (uint32_t)kernel_map_size )
+    {
+        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
+        kernel->info.parameters = param_def;
+        kernel->info.numParams = (vx_uint32)param_def_size;
+        kernel->info.initialize = initializer;
+        // Register code source
+        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
+                "vsi_nn_kernel_header",
+                kernel_map[i].source_name );
+        // Register binary source
+        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+                kernel_map[i].source_name );
+        status = VSI_SUCCESS;
+    }
+
+    return status;
+} /* _query_kernel() */
+
+
+static vsi_nn_kernel_node_t _setup
+    (
+    vsi_nn_graph_t * graph,
+    vsi_nn_tensor_t ** inputs,
+    size_t input_num,
+    vsi_nn_tensor_t ** outputs,
+    size_t output_num,
+    const vsi_nn_kernel_param_t * params,
+    vsi_nn_kernel_t * kernel
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_node_param_t node_params[_CUSTOM_LETTERBOX_PARAM_NUM];
+    vsi_nn_kernel_node_t node = NULL;
+    size_t i = 0;
+
+    int32_t top = vsi_nn_kernel_param_get_int32( params, "top");
+    int32_t bottom = vsi_nn_kernel_param_get_int32( params, "bottom");
+    int32_t left = vsi_nn_kernel_param_get_int32( params, "left");
+    int32_t right = vsi_nn_kernel_param_get_int32( params, "right");
+    float mean_r = vsi_nn_kernel_param_get_float32( params, "mean_r");
+    float mean_g = vsi_nn_kernel_param_get_float32( params, "mean_g");
+    float mean_b = vsi_nn_kernel_param_get_float32( params, "mean_b");
+    float scale_r = vsi_nn_kernel_param_get_float32( params, "scale_r");
+    float scale_g = vsi_nn_kernel_param_get_float32( params, "scale_g");
+    float scale_b = vsi_nn_kernel_param_get_float32( params, "scale_b");
+    int32_t pad_value_r = vsi_nn_kernel_param_get_int32( params, "pad_value_r");
+    int32_t pad_value_g = vsi_nn_kernel_param_get_int32( params, "pad_value_g");
+    int32_t pad_value_b = vsi_nn_kernel_param_get_int32( params, "pad_value_b");
+    int32_t reverse_channel = vsi_nn_kernel_param_get_int32( params, "reverse_channel");
+    vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
+    vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
+
+    uint32_t param_num = _CUSTOM_LETTERBOX_PARAM_NUM;
+    VSI_UNREFERENCED(input_num);
+    VSI_UNREFERENCED(output_num);
+    shapes[0][0] = inputs[0]->attr.size[1] * 3;
+    shapes[0][1] = inputs[0]->attr.size[2];
+    shapes[1][0] = outputs[0]->attr.size[0];
+    shapes[1][1] = outputs[0]->attr.size[1] * 3;
+
+    reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
+        inputs[0], shapes[0], 2 );
+    reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
+        outputs[0], shapes[1], 2 );
+
+    if (reshape_tensors[0] == NULL ||
+        reshape_tensors[1] == NULL)
+    {
+        goto final;
+    }
+
+    if (reverse_channel)
+    {
+        float mean_temp = mean_r;
+        float scale_temp = scale_r;
+        int32_t pad_value_temp = pad_value_r;
+        mean_r = mean_b;
+        mean_b = mean_temp;
+        scale_r = scale_b;
+        scale_b = scale_temp;
+        pad_value_r = pad_value_b;
+        pad_value_b = pad_value_temp;
+    }
+
+    status = _query_kernel( kernel, inputs, outputs );
+    if ( VSI_SUCCESS == status)
+    {
+        node = vsi_nn_kernel_create_node( graph, kernel );
+        if ( node )
+        {
+            uint32_t index = 2;
+
+            vsi_nn_kernel_node_pack_io( node_params, param_num,
+                    reshape_tensors, 1, &reshape_tensors[1], 1 );
+
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &bottom );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &right );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_r );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_g );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_b );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_r );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_g );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_b );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_r );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_g );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_b );
+            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse_channel );
+
+            /* Pass parameters to node. */
+            status = vsi_nn_kernel_node_pass_param( node, node_params, param_num );
+            vsi_nn_kernel_scalar_release( &node_params[2] );
+            vsi_nn_kernel_scalar_release( &node_params[3] );
+            vsi_nn_kernel_scalar_release( &node_params[4] );
+            vsi_nn_kernel_scalar_release( &node_params[5] );
+            vsi_nn_kernel_scalar_release( &node_params[6] );
+            vsi_nn_kernel_scalar_release( &node_params[7] );
+            vsi_nn_kernel_scalar_release( &node_params[8] );
+            vsi_nn_kernel_scalar_release( &node_params[9] );
+            vsi_nn_kernel_scalar_release( &node_params[10] );
+            vsi_nn_kernel_scalar_release( &node_params[11] );
+            vsi_nn_kernel_scalar_release( &node_params[12] );
+            vsi_nn_kernel_scalar_release( &node_params[13] );
+            vsi_nn_kernel_scalar_release( &node_params[14] );
+            vsi_nn_kernel_scalar_release( &node_params[15] );
+
+            CHECK_STATUS(status);
+        }
+    }
+
+final:
+    for (i = 0; i < 2; i++)
+    {
+        vsi_safe_release_tensor(reshape_tensors[i]);
+    }
+
+    return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_EVIS( custom_letterbox, _setup )
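Two details in this kernel are worth unpacking. Kernel variants are selected by packing the input and output dtypes into one key (IN | OUT << 8), and the initializer folds resize geometry plus (x - mean) * scale normalization plus output quantization into one affine constant per channel, so the shader does a single multiply-add per pixel. A standalone sketch of that arithmetic with made-up values; the exact direction of the quant-scale folding depends on how attr->scale is defined in ovxlib, so treat the convention here (q = real / quant_scale + zp) as an assumption:

#include <stdio.h>

/* Fold letterbox geometry and per-channel normalization/quantization
 * into one affine transform, mirroring the initializer above. */
int main(void)
{
    /* assumed example configuration */
    int in_w = 1920, in_h = 1080;            /* source image */
    int out_w = 640, out_h = 640;            /* letterboxed output */
    int top = 106, bottom = 106, left = 0, right = 0;
    float mean_r = 0.0f, scale_norm_r = 1.0f / 255.0f;
    float quant_scale = 1.0f / 255.0f;       /* output tensor scale */
    float out_zp = 0.0f;

    int resize_w = out_w - left - right;
    int resize_h = out_h - top - bottom;
    float scale_w = (float)in_w / resize_w;  /* src pixels per dst pixel */
    float scale_h = (float)in_h / resize_h;

    /* q = x * k + b with k = scale_norm / quant_scale, b = zp - k * mean */
    float k = scale_norm_r / quant_scale;
    float b = out_zp - k * mean_r;

    printf("resize %dx%d, step %.3fx%.3f, affine q = %.3f*x + %.3f\n",
           resize_w, resize_h, scale_w, scale_h, k, b);
    return 0;
}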
@ -35,6 +35,7 @@
|
||||||
#include "utils/vsi_nn_dtype_util.h"
|
#include "utils/vsi_nn_dtype_util.h"
|
||||||
#include "kernel/vsi_nn_kernel.h"
|
#include "kernel/vsi_nn_kernel.h"
|
||||||
#include "libnnext/vsi_nn_vxkernel.h"
|
#include "libnnext/vsi_nn_vxkernel.h"
|
||||||
|
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
|
||||||
|
|
||||||
#define _CPU_ARG_NUM (1)
|
#define _CPU_ARG_NUM (1)
|
||||||
#define _CPU_INPUT_NUM (1)
|
#define _CPU_INPUT_NUM (1)
|
||||||
|
|
@ -42,6 +43,7 @@
|
||||||
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
|
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
|
||||||
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
|
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
|
||||||
#define _KERNEL_NAME ("com.vivantecorp.extension.Softmax2VXC")
|
#define _KERNEL_NAME ("com.vivantecorp.extension.Softmax2VXC")
|
||||||
|
#define _KERNEL_NAME_U8 ("com.vivantecorp.extension.Softmax2VXC_u8")
|
||||||
|
|
||||||
#define SCALAR_INPUT_AXIS (2)
|
#define SCALAR_INPUT_AXIS (2)
|
||||||
|
|
||||||
|
|
@ -64,7 +66,11 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
|
||||||
{
|
{
|
||||||
vsi_status status = VSI_FAILURE;
|
vsi_status status = VSI_FAILURE;
|
||||||
int sf_size = 0;
|
int sf_size = 0;
|
||||||
vsi_nn_kernel_tensor_attr_t* attr = NULL;
|
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
|
||||||
|
float srcZP = 0.0f;
|
||||||
|
float srcScale = 1.0f;
|
||||||
|
float dstZP = 0.0f;
|
||||||
|
float dstScale = 1.0f;
|
||||||
// Alignment with a power of two value.
|
// Alignment with a power of two value.
|
||||||
gpu_param_t gpu_param = {
|
gpu_param_t gpu_param = {
|
||||||
2, // workdim
|
2, // workdim
|
||||||
|
|
@ -75,14 +81,19 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
|
||||||
|
|
||||||
VSI_UNREFERENCED(param_size);
|
VSI_UNREFERENCED(param_size);
|
||||||
|
|
||||||
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
attr[0] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]);
|
||||||
if (!attr)
|
attr[1] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]);
|
||||||
|
if ((!attr[0]) || (!attr[1]))
|
||||||
{
|
{
|
||||||
VSILOGE("Query failure! at line");
|
VSILOGE("Query failure! at line");
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
sf_size = (int)attr->shape->data[0];
|
sf_size = (int)attr[0]->shape->data[0];
|
||||||
|
srcScale = attr[0]->scale;
|
||||||
|
srcZP = (float)attr[0]->zero_point;
|
||||||
|
dstScale = 1.0f / attr[1]->scale;
|
||||||
|
dstZP = (float)attr[1]->zero_point;
|
||||||
|
|
||||||
gpu_param.global_offset[0] = 0;
|
gpu_param.global_offset[0] = 0;
|
||||||
gpu_param.global_offset[1] = 0;
|
gpu_param.global_offset[1] = 0;
|
||||||
|
|
@ -91,7 +102,7 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
|
||||||
gpu_param.local_size[0] = 1;
|
gpu_param.local_size[0] = 1;
|
||||||
gpu_param.local_size[1] = 1;
|
gpu_param.local_size[1] = 1;
|
||||||
gpu_param.global_size[0] =
|
gpu_param.global_size[0] =
|
||||||
gpu_align_p2((1 + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0],
|
gpu_align_p2((attr[0]->shape->data[1] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0],
|
||||||
gpu_param.local_size[0]);
|
gpu_param.local_size[0]);
|
||||||
gpu_param.global_size[1] =
|
gpu_param.global_size[1] =
|
||||||
gpu_align_p2((1 + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1],
|
gpu_align_p2((1 + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1],
|
||||||
|
|
@ -107,25 +118,50 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
|
||||||
0x00000001, 0x00000000, 0x00000001, 0x00000000,
|
0x00000001, 0x00000000, 0x00000001, 0x00000000,
|
||||||
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
|
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
|
||||||
}, GPU_DP_TYPE_16};
|
}, GPU_DP_TYPE_16};
|
||||||
|
gpu_dp_inst_t uniExtract8Bin_2x8 = {{
|
||||||
|
0x11111111, // TCfg
|
||||||
|
0x11110000, // ASelt
|
||||||
|
0x06040200, 0x06040200, // ABin
|
||||||
|
0x22222222, // BSelt
|
||||||
|
0x00000000, 0x00000000, // BBin
|
||||||
|
0x00002400, // AccumType, ConstantType, and PostShift
|
||||||
|
0x00000001, 0x00000001, 0x00000001, 0x00000001,
|
||||||
|
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
|
||||||
|
}, GPU_DP_TYPE_16};
|
||||||
|
|
||||||
status = vsi_nn_kernel_gpu_add_param( node,
|
status = vsi_nn_kernel_gpu_add_param( node,
|
||||||
"Uni4x4_Fp16ToFp32", &Uni4x4_Fp16ToFp32 );
|
"Uni4x4_Fp16ToFp32", &Uni4x4_Fp16ToFp32 );
|
||||||
vsi_nn_kernel_gpu_add_param(node,
|
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||||
|
"uniExtract8Bin_2x8", &uniExtract8Bin_2x8 );
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
"sf_size", &sf_size);
|
"sf_size", &sf_size);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node, "srcScale", &srcScale);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node, "srcZP", &srcZP);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node, "dstScale", &dstScale);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node, "dstZP", &dstZP);
|
||||||
}
|
}
|
||||||
|
|
||||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
status |= vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||||
|
|
||||||
if(status != VSI_SUCCESS)
|
if(status != VSI_SUCCESS)
|
||||||
{
|
{
|
||||||
VSILOGE("Initializer failure!");
|
VSILOGE("Initializer failure!");
|
||||||
}
|
}
|
||||||
if (attr) vsi_nn_kernel_tensor_attr_release( &attr );
|
if (attr[0])
|
||||||
|
{
|
||||||
|
vsi_nn_kernel_tensor_attr_release( &attr[0] );
|
||||||
|
attr[0] = NULL;
|
||||||
|
}
|
||||||
|
if (attr[1])
|
||||||
|
{
|
||||||
|
vsi_nn_kernel_tensor_attr_release( &attr[1] );
|
||||||
|
attr[1] = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const vx_kernel_description_t _kernel_info =
|
static const vx_kernel_description_t _kernel_info1 =
|
||||||
{
|
{
|
||||||
KERNEL_ID_PLACEHOLDER,
|
KERNEL_ID_PLACEHOLDER,
|
||||||
_KERNEL_NAME,
|
_KERNEL_NAME,
|
||||||
|
|
@ -139,6 +175,20 @@ static const vx_kernel_description_t _kernel_info =
|
||||||
vsi_nn_KernelDeinitializer
|
vsi_nn_KernelDeinitializer
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static const vx_kernel_description_t _kernel_info2 =
|
||||||
|
{
|
||||||
|
KERNEL_ID_PLACEHOLDER,
|
||||||
|
_KERNEL_NAME_U8,
|
||||||
|
NULL,
|
||||||
|
kernel_param_def,
|
||||||
|
_cnt_of_array( kernel_param_def ),
|
||||||
|
vsi_nn_KernelValidator,
|
||||||
|
NULL,
|
||||||
|
NULL,
|
||||||
|
_softmax_initializer,
|
||||||
|
 vsi_nn_KernelDeinitializer
 };

 static vsi_status _query_kernel
     (
     vsi_nn_tensor_t* const* const inputs,

@@ -146,9 +196,20 @@ static vsi_status _query_kernel
     vsi_nn_kernel_t* kernel
     )
 {
-    VSI_UNREFERENCED(inputs);
-    VSI_UNREFERENCED(outputs);
-    memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+    vsi_nn_kernel_dtype_e in_dtype;
+    vsi_nn_kernel_dtype_e out_dtype;
+
+    in_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
+    out_dtype = vsi_nn_kernel_map_dtype(outputs[0]->attr.dtype.vx_type);
+
+    if (in_dtype == U8 && out_dtype == U8)
+    {
+        memmove( &kernel->info, &_kernel_info2, sizeof(vx_kernel_description_t) );
+    }
+    else
+    {
+        memmove( &kernel->info, &_kernel_info1, sizeof(vx_kernel_description_t) );
+    }
+
     vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
         "vsi_nn_kernel_header",

@@ -173,12 +234,42 @@ static vsi_nn_kernel_node_t _setup
     vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
     vsi_nn_kernel_node_t node = NULL;
     int32_t axis = 0;
+    vsi_nn_tensor_t* reshape_tensors[2] = {NULL};
+    vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}};
+    uint32_t rank_in = 0;
+    int32_t new_axis = 0;
+    uint32_t i = 0;
+    vsi_bool ret = vx_false_e;

     VSI_UNREFERENCED(input_num);
     VSI_UNREFERENCED(output_num);

     axis = vsi_nn_kernel_param_get_int32(params, "axis");

+    ret = vsi_nn_kernel_optimize_softmax_shape(inputs[0]->attr.size,
+                                               inputs[0]->attr.dim_num,
+                                               axis,
+                                               shapes[0],
+                                               &rank_in,
+                                               &new_axis);
+
+    if (ret)
+    {
+        reshape_tensors[0] = vsi_nn_reshape_tensor(graph, inputs[0], shapes[0], rank_in);
+        reshape_tensors[1] = vsi_nn_reshape_tensor(graph, outputs[0], shapes[0], rank_in);
+    }
+    else
+    {
+        return NULL;
+    }
+
+    if (!vsi_nn_kernel_gpu_check_shape(reshape_tensors[0]->attr.size,
+                                       reshape_tensors[0]->attr.dim_num) ||
+        new_axis > 2)
+    {
+        return NULL;
+    }
+
     status = _query_kernel( inputs, outputs, kernel );
     if( VSI_SUCCESS == status)
     {

@@ -187,9 +278,9 @@ static vsi_nn_kernel_node_t _setup
         {
             /* Set inputs and outputs */
             vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
-                inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
+                reshape_tensors, _CPU_INPUT_NUM, &reshape_tensors[1], _CPU_OUTPUT_NUM );
             backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
-                graph, I32, &axis );
+                graph, I32, &new_axis );

             /* Pass parameters to node. */
             status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );

@@ -200,6 +291,11 @@ static vsi_nn_kernel_node_t _setup
             status = VSI_FAILURE;
         }
     }
+
+    for (i = 0; i < 2; i++)
+    {
+        vsi_safe_release_tensor(reshape_tensors[i]);
+    }
     return node;
 } /* _setup() */
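
Note on the reshape introduced in _setup above: this diff does not show the body of vsi_nn_kernel_optimize_softmax_shape, so the sketch below is only a hedged illustration of the usual folding idea (merge every dimension below the softmax axis into one, and every dimension above it into another, so the kernel always sees the axis at a fixed position). collapse_softmax_shape is a hypothetical name, not the library API.

    /* Hedged sketch only: illustrates the dimension-folding idea behind
     * vsi_nn_kernel_optimize_softmax_shape; the real helper may differ. */
    #include <stdint.h>

    static void collapse_softmax_shape(
        const uint32_t* in_shape, uint32_t rank, int32_t axis,
        uint32_t out_shape[3], int32_t* new_axis)
    {
        uint32_t inner = 1, outer = 1, i;
        for (i = 0; i < (uint32_t)axis; i++)   inner *= in_shape[i]; /* dims below axis */
        for (i = (uint32_t)axis + 1; i < rank; i++) outer *= in_shape[i]; /* dims above axis */
        out_shape[0] = inner;
        out_shape[1] = in_shape[axis];
        out_shape[2] = outer;
        *new_axis = 1; /* softmax axis after folding */
    }
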
@@ -0,0 +1,227 @@
+/****************************************************************************
+*
+*    Copyright (c) 2020 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <string.h>
+#include <stdlib.h>
+
+#include "vsi_nn_types.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_node.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_ops.h"
+#include "vsi_nn_tensor.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "vsi_nn_internal_node.h"
+#include "utils/vsi_nn_constraint_check.h"
+
+typedef struct _custom_letterbox_local_data_t {
+    int32_t placeholder;
+} custom_letterbox_local_data_t;
+
+/*
+ Declare number of input and output.
+ */
+#define _INPUT_NUM          (1)
+#define _OUTPUT_NUM         (1)
+
+int32_t my_round(float in)
+{
+    if (in >= 0)
+    {
+        return (int)(in + 0.5f);
+    }
+    else
+    {
+        return (int)(in - 0.5f);
+    }
+}
+
+static vsi_status op_compute
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    vsi_nn_kernel_param_t * param = NULL;
+    vsi_nn_custom_letterbox_param * p;
+    p = &(self->nn_param.custom_letterbox);
+    int32_t shape_w = (int32_t)inputs[0]->attr.size[1];
+    int32_t shape_h = (int32_t)inputs[0]->attr.size[2];
+    int32_t new_shape_w = (int32_t)outputs[0]->attr.size[0];
+    int32_t new_shape_h = (int32_t)outputs[0]->attr.size[1];
+    vx_bool auto_bool = p->auto_bool;
+    vx_bool scaleFill = p->scaleFill;
+    vx_bool scaleup = p->scaleup;
+    int32_t stride = p->stride;
+    vx_bool center = p->center;
+
+    float r = 1.0f;
+    int32_t new_unpad_w = 0;
+    int32_t new_unpad_h = 0;
+    int32_t dw = 0;
+    int32_t dh = 0;
+    int32_t top = 0;
+    int32_t bottom = 0;
+    int32_t left = 0;
+    int32_t right = 0;
+
+    r = (float)fmin((float)new_shape_w / shape_w, (float)new_shape_h / shape_h);
+    if (!scaleup)
+    {
+        r = (float)fmin(r, 1.0f);
+    }
+
+    new_unpad_w = my_round(r * shape_w);
+    new_unpad_h = my_round(r * shape_h);
+    dw = new_shape_w - new_unpad_w;
+    dh = new_shape_h - new_unpad_h;
+    if (auto_bool)
+    {
+        dw = dw % stride;
+        dh = dh % stride;
+    }
+    else if (scaleFill)
+    {
+        dw = 0;
+        dh = 0;
+        new_unpad_w = new_shape_w;
+        new_unpad_h = new_shape_h;
+    }
+    if (center)
+    {
+        top = my_round(dh / 2.0f - 0.1f);
+        bottom = my_round(dh / 2.0f + 0.1f);
+        left = my_round(dw / 2.0f - 0.1f);
+        right = my_round(dw / 2.0f + 0.1f);
+    }
+    else
+    {
+        top = 0;
+        bottom = my_round(dh + 0.1f);
+        left = 0;
+        right = my_round(dw + 0.1f);
+    }
+
+    param = vsi_nn_kernel_param_create();
+    vsi_nn_kernel_param_add_int32( param, "top", top);
+    vsi_nn_kernel_param_add_int32( param, "bottom", bottom);
+    vsi_nn_kernel_param_add_int32( param, "left", left);
+    vsi_nn_kernel_param_add_int32( param, "right", right);
+    vsi_nn_kernel_param_add_float32( param, "mean_r", p->mean_r);
+    vsi_nn_kernel_param_add_float32( param, "mean_g", p->mean_g);
+    vsi_nn_kernel_param_add_float32( param, "mean_b", p->mean_b);
+    vsi_nn_kernel_param_add_float32( param, "scale_r", p->scale_r);
+    vsi_nn_kernel_param_add_float32( param, "scale_g", p->scale_g);
+    vsi_nn_kernel_param_add_float32( param, "scale_b", p->scale_b);
+    vsi_nn_kernel_param_add_int32( param, "pad_value_r", p->pad_value_r);
+    vsi_nn_kernel_param_add_int32( param, "pad_value_g", p->pad_value_g);
+    vsi_nn_kernel_param_add_int32( param, "pad_value_b", p->pad_value_b);
+    vsi_nn_kernel_param_add_int32( param, "reverse_channel", p->reverse_channel);
+
+    self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
+        "custom_letterbox",
+        inputs, 1,
+        outputs, 1, param );
+
+    vsi_nn_kernel_param_release( &param );
+
+    return VSI_SUCCESS;
+} /* op_compute() */
+
+static vsi_bool op_check
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    BEGIN_IO_TYPE_DECL(LETTERBOX, 1, 1)
+        IO_TYPE(D_U8, D_F16)
+        IO_TYPE(D_U8, D_U8|Q_ASYM)
+        IO_TYPE(D_U8, D_I8|Q_DFP)
+        IO_TYPE(D_U8, D_I8|Q_ASYM)
+        IO_TYPE(D_U8, D_I8|Q_SYM)
+    END_IO_TYPE_DECL(LETTERBOX)
+    if (!VALIDATE_OP_IO_TYPES(LETTERBOX, self, inputs, self->input.num, outputs, self->output.num)) {
+        char* desc = generate_op_io_types_desc(inputs,
+                self->input.num, outputs, self->output.num);
+        VSILOGE("Inputs/Outputs data type not support: %s", desc);
+        destroy_op_io_types_desc(desc);
+        return FALSE;
+    }
+
+    return TRUE;
+} /* op_check() */
+
+static vsi_bool op_setup
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+    {
+        outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
+        outputs[0]->attr.size[0] = self->nn_param.custom_letterbox.new_shape_w;
+        outputs[0]->attr.size[1] = self->nn_param.custom_letterbox.new_shape_h;
+        outputs[0]->attr.size[2] = 3;
+        outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
+    }
+
+    return TRUE;
+} /* op_setup() */
+
+static vsi_status op_deinit
+    (
+    vsi_nn_node_t* self
+    )
+{
+    vsi_status status = VSI_SUCCESS;
+
+    status = vsi_nn_op_common_deinit(self);
+
+    return status;
+} /* op_deinit() */
+
+__BEGIN_DECLS
+
+/* Registrar */
+DEF_OP_REG
+    (
+    /* op_name    */ CUSTOM_LETTERBOX,
+    /* init       */ NULL,
+    /* compute    */ op_compute,
+    /* deinit     */ op_deinit,
+    /* check      */ op_check,
+    /* setup      */ op_setup,
+    /* optimize   */ NULL,
+    /* input_num  */ _INPUT_NUM,
+    /* output_num */ _OUTPUT_NUM
+    );
+
+__END_DECLS
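
The padding math in op_compute above follows the familiar YOLO-style letterbox. As a standalone check with illustrative numbers (not taken from the commit): fitting a 1280x720 frame onto a 640x640 canvas gives r = min(0.5, 0.889) = 0.5, new_unpad = 640x360, dh = 280, and with center set both top and bottom round to 140.

    /* Standalone check of the letterbox padding arithmetic above;
     * the input/output sizes are illustrative only. */
    #include <math.h>
    #include <stdio.h>

    static int my_round_f(float in) /* same rounding rule as my_round() above */
    {
        return in >= 0 ? (int)(in + 0.5f) : (int)(in - 0.5f);
    }

    int main(void)
    {
        int shape_w = 1280, shape_h = 720;        /* source frame */
        int new_shape_w = 640, new_shape_h = 640; /* letterboxed canvas */
        float r = fminf((float)new_shape_w / shape_w, (float)new_shape_h / shape_h);
        int new_unpad_h = my_round_f(r * shape_h); /* 360 */
        int dh = new_shape_h - new_unpad_h;        /* 280 */
        /* center == TRUE: split the padding evenly across top and bottom */
        printf("r=%.3f top=%d bottom=%d\n", r,
               my_round_f(dh / 2.0f - 0.1f), my_round_f(dh / 2.0f + 0.1f)); /* 140 / 140 */
        return 0;
    }
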
@@ -85,18 +85,24 @@ static const struct {
     HASH_CUMSUM_KERNELS(0, U8, U8)
     HASH_CUMSUM_KERNELS(0, F32, F32)
     HASH_CUMSUM_KERNELS(0, F32, U8)
+    HASH_CUMSUM_KERNELS(0, I32, I32)
     HASH_CUMSUM_KERNELS(1, U8, U8)
     HASH_CUMSUM_KERNELS(1, F32, F32)
     HASH_CUMSUM_KERNELS(1, F32, U8)
+    HASH_CUMSUM_KERNELS(1, I32, I32)
     HASH_CUMSUM_KERNELS(2, U8, U8)
     HASH_CUMSUM_KERNELS(2, F32, F32)
     HASH_CUMSUM_KERNELS(2, F32, U8)
+    HASH_CUMSUM_KERNELS(2, I32, I32)

     HASH_CUMSUM_KERNELS_2D(0, U8, U8)
     HASH_CUMSUM_KERNELS_2D(0, F32, F32)
     HASH_CUMSUM_KERNELS_2D(0, F32, U8)
+    HASH_CUMSUM_KERNELS_2D(0, I32, I32)
     HASH_CUMSUM_KERNELS_2D(1, U8, U8)
     HASH_CUMSUM_KERNELS_2D(1, F32, F32)
     HASH_CUMSUM_KERNELS_2D(1, F32, U8)
+    HASH_CUMSUM_KERNELS_2D(1, I32, I32)

     HASH_CUMSUM_ARRAY_KERNELS(0, U8, U8, KERNEL_SOURCE_3)
     HASH_CUMSUM_ARRAY_KERNELS(0, F32, F32, KERNEL_SOURCE_3)
@@ -26,6 +26,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"
@@ -644,7 +645,8 @@ static vsi_nn_kernel_node_t _setup

 #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
     shader_cnt_support =
-        (graph->ctx->config.subGroupSize >= 64 && graph->ctx->config.use_40bits_va) ? TRUE : FALSE;
+        (((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize >= 64 &&
+        ((vsi_nn_graph_prv_t*)graph)->options->config.use_40bits_va) ? TRUE : FALSE;
 #endif
     if ((in1_h % 64 == 0) && (transFlg == 1) && (out_h % 8 == 0) && shader_cnt_support)
     {
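
This hunk and several later ones replace graph->ctx->config.* with ((vsi_nn_graph_prv_t*)graph)->options->config.*, moving the hardware config behind the private graph type declared in the newly included vsi_nn_types_prv.h. The snippet below is only a hedged sketch of why such a downcast is legal; the stand-in types and the assumption that the public struct is the first member are mine, not the library's definitions.

    /* Hedged sketch of the public/private struct idiom used above.
     * pub_graph_t / prv_graph_t are stand-ins; the real layout of
     * vsi_nn_graph_prv_t is assumed here, not shown in the diff. */
    typedef struct { int placeholder; } pub_graph_t;   /* plays vsi_nn_graph_t */

    typedef struct {
        pub_graph_t pub;   /* public part first, so a pub_graph_t* can be downcast */
        struct {
            struct { int subGroupSize; int use_40bits_va; } config;
        } *options;        /* assumed shape of the private options block */
    } prv_graph_t;         /* plays vsi_nn_graph_prv_t */

    static int sub_group_size(pub_graph_t* graph)
    {
        /* mirrors ((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize */
        return ((prv_graph_t*)graph)->options->config.subGroupSize;
    }
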
@@ -75,6 +75,7 @@ static const _kernel_map_type _one_hot_kernel_map[] =
     PACK_ONE_HOT_KERNEL_MAP( F32, F32 ),
     PACK_ONE_HOT_KERNEL_MAP( I32, I32 ),
     PACK_ONE_HOT_KERNEL_MAP( I32, F32 ),
+    PACK_ONE_HOT_KERNEL_MAP( I32, BF16 ),
     PACK_ONE_HOT_KERNEL_MAP( I32, U8 ),
     PACK_ONE_HOT_KERNEL_MAP( U8, U8 ),
 };
@@ -79,7 +79,7 @@ static const struct {
     const char* source_name;
 } kernel_map[] =
 {
     PRELU_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1)
     PRELU_KERNELS_FLOAT(F16, F16, F16, KERNEL_SOURCE_1)
     PRELU_KERNELS(U8, U8, U8, KERNEL_SOURCE_1)
     PRELU_KERNELS(I32, I32, I32, KERNEL_SOURCE_1)
@@ -0,0 +1,329 @@
+/****************************************************************************
+*
+*    Copyright (c) 2020 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */
+typedef enum
+{
+    INTERNAL_KERNEL_ROPE,
+} _internal_kernel_e;
+
+#define _ROPE_KERNEL_SOURCE      "rope"
+#define _ROPE_KERNEL_NAME        CVIVANTE_NAMESPACE("cl.rope")
+
+// Add kernel hashtable here
+#define STR(a) #a
+#define ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ) \
+        ((IN0_DTYPE) | (IN0_DTYPE << 8) | (OUT_DTYPE << 16) | (AXIS << 25))
+#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ) \
+        { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ), \
+          CVIVANTE_NAMESPACE("cl.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_axis"STR(AXIS)), \
+          "rope_0" }
+
+typedef struct
+{
+    uint32_t key;
+    char * function_name;
+    const char * source_name;
+} _kernel_map_type;
+
+static const _kernel_map_type _rope_kernel_map[] =
+{
+    // Register kernel here
+    PACK_KERNEL_MAP( F32, F32, F32, 0 ),
+    PACK_KERNEL_MAP( F32, F32, F32, 1 ),
+    PACK_KERNEL_MAP( F32, F32, F32, 2 ),
+    PACK_KERNEL_MAP( I32, I32, I32, 0 ),
+    PACK_KERNEL_MAP( I32, I32, I32, 1 ),
+    PACK_KERNEL_MAP( I32, I32, I32, 2 ),
+    PACK_KERNEL_MAP( U32, U32, U32, 0 ),
+    PACK_KERNEL_MAP( U32, U32, U32, 1 ),
+    PACK_KERNEL_MAP( U32, U32, U32, 2 ),
+};
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _rope_kernel_param_def[] =
+{
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+};
+#define _ROPE_PARAM_NUM  _cnt_of_array( _rope_kernel_param_def )
+#define SCALAR_AXIS           (4)
+#define SCALAR_IN_ZP          (5)
+#define SCALAR_COS_ZP         (6)
+#define SCALAR_SIN_ZP         (7)
+#define SCALAR_SCALE0         (8)
+#define SCALAR_SCALE1         (9)
+#define SCALAR_OUT_ZP         (10)
+#define SCALAR_HALF_HEAD_SIZE (11)
+#define SCALAR_STEP           (12)
+/*
+ * Kernel initializer
+ */
+DEF_KERNEL_INITIALIZER(_rope_initializer)
+    (
+    vsi_nn_kernel_node_t node,
+    const vsi_nn_kernel_node_param_t * param,
+    size_t param_size
+    )
+{
+    gpu_param_t gpu_param = {
+        3,         // workdim
+        {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
+        {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
+        {0, 0, 0}, // localWorkSize: local group size in thread
+        {0, 0, 0}  // globalWorkSize: image size in thread
+        };
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_tensor_attr_t* attr[2] = { NULL };
+    int32_t axis = 0;
+    vsi_size_array_t* out_shape = NULL;
+    vsi_size_t shape[3] = { 1 };
+
+    VSI_UNREFERENCED(node);
+    VSI_UNREFERENCED(param_size);
+
+    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
+    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
+    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
+
+    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis);
+    CHECK_STATUS_FAIL_GOTO(status, final);
+
+    out_shape = attr[1]->shape;
+    shape[0] = out_shape->data[0];
+    shape[1] = out_shape->data[1];
+    shape[2] = out_shape->data[2];
+    shape[axis] = shape[axis] / 2;
+
+    gpu_param.global_scale[0] = 1;
+    gpu_param.global_scale[1] = 1;
+    gpu_param.global_scale[2] = 1;
+    gpu_param.global_size[0] = shape[0];
+    gpu_param.global_size[1] = shape[1];
+    gpu_param.global_size[2] = out_shape->size > 2 ? shape[2] : 1;
+
+    status = vsi_nn_kernel_gpu_config(node, &gpu_param);
+
+final:
+#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
+    SAFE_FREE_TENSOR_ATTR(attr[0]);
+    SAFE_FREE_TENSOR_ATTR(attr[1]);
+#undef SAFE_FREE_TENSOR_ATTR
+
+    return status;
+} /* _rope_initializer() */
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+    (
+    vsi_nn_kernel_t * kernel,
+    vsi_nn_tensor_t * const * const inputs,
+    vsi_nn_tensor_t * const * const outputs,
+    int32_t axis
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_dtype_e in0_dtype;
+    vsi_nn_kernel_dtype_e in1_dtype;
+    vsi_nn_kernel_dtype_e in2_dtype;
+    vsi_nn_kernel_dtype_e out_dtype;
+    const _kernel_map_type * kernel_map = _rope_kernel_map;
+    size_t kernel_map_size = _cnt_of_array( _rope_kernel_map );
+    vx_param_description_t * param_def = _rope_kernel_param_def;
+    vx_kernel_initialize_f initializer = _rope_initializer;
+
+    uint32_t key = 0;
+    uint32_t i;
+
+    in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
+    in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type);
+    in2_dtype = vsi_nn_kernel_map_dtype(inputs[2]->attr.dtype.vx_type);
+    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+
+#define _PACK_SELECT_KEY( in0_type, in1_type, in2_type, out_type ) \
+        ((in0_type) | (in1_type << 8) | (in2_type << 16) | (out_type << 24))
+    switch (_PACK_SELECT_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype))
+    {
+    case _PACK_SELECT_KEY(F32, F32, F32, F32):
+    case _PACK_SELECT_KEY(F16, F16, F16, F16):
+        key = ROPE_HASH_KEY(F32, F32, F32, axis);
+        break;
+    case _PACK_SELECT_KEY(U8, U8, U8, U8):
+    case _PACK_SELECT_KEY(U16, U16, U16, U16):
+        key = ROPE_HASH_KEY(U32, U32, U32, axis);
+        break;
+    case _PACK_SELECT_KEY(I8, I8, I8, I8):
+    case _PACK_SELECT_KEY(I16, I16, I16, I16):
+    case _PACK_SELECT_KEY(I32, I32, I32, I32):
+        key = ROPE_HASH_KEY(I32, I32, I32, axis);
+        break;
+    default:
+        break;
+    }
+#undef _PACK_SELECT_KEY
+
+    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
+    {
+        if ( kernel_map[i].key == key )
+        {
+            break;
+        }
+    }
+    if ( i < (uint32_t)kernel_map_size )
+    {
+        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
+        kernel->info.parameters = param_def;
+        kernel->info.numParams = _cnt_of_array( _rope_kernel_param_def );
+        kernel->info.initialize = initializer;
+        // Register code source
+        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
+                "eltwise_ops_helper",
+                kernel_map[i].source_name );
+        // Register binary source
+        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+                kernel_map[i].source_name );
+        status = VSI_SUCCESS;
+    }
+    return status;
+} /* _query_kernel() */
+
+
+static vsi_nn_kernel_node_t _setup
+    (
+    vsi_nn_graph_t * graph,
+    vsi_nn_tensor_t ** inputs,
+    size_t input_num,
+    vsi_nn_tensor_t ** outputs,
+    size_t output_num,
+    const vsi_nn_kernel_param_t * params,
+    vsi_nn_kernel_t * kernel
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_node_param_t node_params[_ROPE_PARAM_NUM] = {NULL};
+    vsi_nn_kernel_node_t node = NULL;
+    int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
+    int32_t interleaved = vsi_nn_kernel_param_get_int32(params, "interleaved");
+    float in_scale = vsi_nn_get_tensor_scale(inputs[0]);
+    float cos_scale = vsi_nn_get_tensor_scale(inputs[1]);
+    float sin_scale = vsi_nn_get_tensor_scale(inputs[2]);
+    float out_scale = vsi_nn_get_tensor_scale(outputs[0]);
+    float in_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
+    float cos_zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
+    float sin_zp = (float)vsi_nn_get_tensor_zero_point(inputs[2]);
+    float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
+    int32_t half_head_size = interleaved ? 1 : (int32_t)(inputs[0]->attr.size[axis] / 2);
+    float scale0 = in_scale * cos_scale / out_scale;
+    float scale1 = in_scale * sin_scale / out_scale;
+    int32_t step = interleaved ? 2 : 1;
+    int32_t i = 0;
+
+    // Check if gpu can support the size
+    if ( !vsi_nn_kernel_gpu_check_shape(
+        inputs[0]->attr.size, inputs[0]->attr.dim_num ) )
+    {
+        return NULL;
+    }
+
+    status = _query_kernel( kernel, inputs, outputs, axis );
+    if (VSI_SUCCESS == status)
+    {
+        node = vsi_nn_kernel_create_node( graph, kernel );
+        if ( node )
+        {
+            /* Set inputs and outputs */
+            vsi_nn_kernel_node_pack_io( node_params, _ROPE_PARAM_NUM,
+                    inputs, input_num, outputs, output_num );
+            /* Pass parameters to node. */
+            node_params[SCALAR_AXIS] = vsi_nn_kernel_scalar_create(
+                    graph, I32, &axis);
+            node_params[SCALAR_IN_ZP] = vsi_nn_kernel_scalar_create(
+                    graph, F32, &in_zp);
+            node_params[SCALAR_COS_ZP] = vsi_nn_kernel_scalar_create(
+                    graph, F32, &cos_zp);
+            node_params[SCALAR_SIN_ZP] = vsi_nn_kernel_scalar_create(
+                    graph, F32, &sin_zp);
+            node_params[SCALAR_SCALE0] = vsi_nn_kernel_scalar_create(
+                    graph, F32, &scale0);
+            node_params[SCALAR_SCALE1] = vsi_nn_kernel_scalar_create(
+                    graph, F32, &scale1);
+            node_params[SCALAR_OUT_ZP] = vsi_nn_kernel_scalar_create(
+                    graph, F32, &output_zp);
+            node_params[SCALAR_HALF_HEAD_SIZE] = vsi_nn_kernel_scalar_create(
+                    graph, I32, &half_head_size);
+            node_params[SCALAR_STEP] = vsi_nn_kernel_scalar_create(
+                    graph, I32, &step);
+            status = vsi_nn_kernel_node_pass_param( node, node_params, _ROPE_PARAM_NUM );
+        }
+    }
+
+    for (i = SCALAR_AXIS; i < (int32_t)_ROPE_PARAM_NUM; i++)
+    {
+        if (node_params[i])
+        {
+            vsi_nn_kernel_scalar_release(&node_params[i]);
+        }
+    }
+    return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_CL( rope, _setup )
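
A note on ROPE_HASH_KEY above: the key keeps the input dtype in bits 0-7, repeats IN0_DTYPE in bits 8-15, places the output dtype at bit 16 and the axis at bit 25, so IN1_DTYPE never reaches the key as written. That still yields distinct keys for the table above, since every registered entry uses one dtype for all operands. A hedged check with made-up enum values (the real vsi_nn_kernel_dtype_e numbering is not shown in this diff):

    /* Hedged check of the key packing; f32/i32 values are illustrative
     * stand-ins for the real vsi_nn_kernel_dtype_e enum. */
    #include <stdint.h>
    #include <assert.h>

    #define ROPE_HASH_KEY(IN0, IN1, OUT, AXIS) \
            ((IN0) | ((IN0) << 8) | ((OUT) << 16) | ((AXIS) << 25))

    int main(void)
    {
        uint32_t f32 = 10, i32 = 11; /* illustrative enum values */
        /* axis occupies its own bit range, so the same dtypes with
           different axes never collide: */
        assert(ROPE_HASH_KEY(f32, f32, f32, 1) != ROPE_HASH_KEY(f32, f32, f32, 2));
        /* IN1 is not folded into the key as written, so these two collide: */
        assert(ROPE_HASH_KEY(f32, f32, f32, 0) == ROPE_HASH_KEY(f32, i32, f32, 0));
        return 0;
    }
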
@@ -27,6 +27,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"

@@ -299,7 +300,7 @@ static vsi_nn_kernel_node_t _setup
     VSI_UNREFERENCED(output_num);

 #if (VX_ACTIVATION_EXT_SUPPORT)
-    if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
+    if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
     {
         return NULL;
     }
@@ -26,6 +26,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"

@@ -457,7 +458,7 @@ static vsi_nn_kernel_node_t _setup
     vsi_bool is_odd_even_sort = FALSE;
     vsi_bool is_bitnoic_segment = FALSE;
     size_t param_num = _TOPK_PARAM_NUM;
-    int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2);
+    int32_t max_stages = 7 + (int32_t)log2(((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize >> 2);
     vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
     vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

@@ -483,6 +484,11 @@ static vsi_nn_kernel_node_t _setup
         return NULL;
     }

+    if (block_size >= GPU_TENSOR_MAX_WIDTH)
+    {
+        return NULL;
+    }
+
     shape[0][0] = block_size;
     shape[0][1] = block_num;
     shape[1][0] = top_k;
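
For the max_stages line above: with an example subGroupSize of 64, subGroupSize >> 2 is 16, so max_stages = 7 + log2(16) = 11; the new guard then rejects block sizes that would not fit one GPU tensor row. Quick numeric check:

    /* Numeric check of the max_stages formula above; 64 is just an
     * example subGroupSize, not a value taken from the commit. */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        int subGroupSize = 64;
        int max_stages = 7 + (int)log2(subGroupSize >> 2); /* 7 + 4 = 11 */
        printf("max_stages=%d\n", max_stages);
        return 0;
    }
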
@@ -27,6 +27,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"

@@ -192,7 +193,7 @@ static vsi_bool _bucketize_support_types
         return FALSE;
     }

-    if (in_dtype == F16 && graph->ctx->config.evis.ver != VSI_NN_HW_EVIS_2)
+    if (in_dtype == F16 && ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver != VSI_NN_HW_EVIS_2)
     {
         return FALSE;
     }
@@ -27,6 +27,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"

@@ -771,7 +772,8 @@ static vsi_nn_kernel_node_t _setup
     temp_tensor[1] = weights;
     temp_tensor[2] = biases;

-    ks = get_kernel_size(weights->attr.size[0], dilation, stride, graph->ctx->config.evis.ver);
+    ks = get_kernel_size(weights->attr.size[0], dilation, stride,
+        ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver);

     status = _query_kernel( kernel, temp_tensor, outputs, dilation, ks);
@@ -121,7 +121,9 @@ static const _kernel_map_type _groupnorm_sums_kernel_map[] =
     TENSOR_GROUPNORM_SUMS_KERNELS( U8, F32, KERNEL_SOURCE_0 )
     TENSOR_GROUPNORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_0 )
     TENSOR_GROUPNORM_SUMS_KERNELS( I16, F32, KERNEL_SOURCE_2 )
+    TENSOR_GROUPNORM_SUMS_KERNELS( U16, F32, KERNEL_SOURCE_2 )
     TENSOR_GROUPNORM_SUMS_KERNELS_2D( I16, F32, KERNEL_SOURCE_2 )
+    TENSOR_GROUPNORM_SUMS_KERNELS_2D( U16, F32, KERNEL_SOURCE_2 )
     TENSOR_GROUPNORM_SUMS_KERNELS( F16, F32, KERNEL_SOURCE_2 )
     TENSOR_GROUPNORM_SUMS_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 )
 };

@@ -174,6 +176,9 @@ static const _kernel_map_type _groupnorm_kernel_map[] =
     TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, U8, KERNEL_SOURCE_2 )
     TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, F16, KERNEL_SOURCE_2 )
     TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, F16, KERNEL_SOURCE_2 )
+
+    TENSOR_GROUPNORM_SCALE_KERNELS( U16, F32, U16, KERNEL_SOURCE_2 )
+    TENSOR_GROUPNORM_SCALE_KERNELS_2D( U16, F32, U16, KERNEL_SOURCE_2 )
 };

 /*

@@ -245,6 +250,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
     float sum_x2_tail0 = 1;
     float sum_x2_tail1 = 1;
     float work_item_pixels = 1;
+    vsi_bool is_input_8bits = FALSE;

     VSI_UNREFERENCED(param_size);

@@ -263,12 +269,13 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
     width = (int32_t)(input_shape->data[0]);
     height = (int32_t)(input_shape->data[1]);
     chn = (int32_t)(attr[1]->shape->data[1]);
+    is_input_8bits = attr[0]->dtype == I8 || attr[0]->dtype == U8;
     if (is2D)
     {
         height = 1;
     }

-    work_item_pixels = (float)height * 16;
+    work_item_pixels = is_input_8bits ? 16 * (float)height : 8 * (float)height;

     sum_x_tail = -work_item_pixels * input_zp * input_scale;
     sum_x2_tail0 = work_item_pixels * input_zp * input_zp * input_scale2;

@@ -281,11 +288,11 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
     shaderParam.local_size[1] = 1;
     shaderParam.local_size[2] = 1;

-    if (attr[0]->dtype == I8 || attr[0]->dtype == U8)
+    if (is_input_8bits)
     {
         shaderParam.global_size[0] = (width + 255) / 256 * 16;
     }
-    else if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
+    else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
     {
         shaderParam.global_size[0] = (width + 127) / 128 * 16;
     }

@@ -324,7 +331,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
         status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1);
         CHECK_STATUS_FAIL_GOTO(status, OnError );
     }
-    else if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
+    else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
     {
         gpu_dp_inst_t uniSum_X_X2_8x2 = {{
             0x55555555, // TCfg

@@ -483,7 +490,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
     }

     shaderParam.global_scale[0] = 16;
-    if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
+    if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
     {
         shaderParam.global_scale[0] = 8;
     }

@@ -610,6 +617,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
             CHECK_STATUS_FAIL_GOTO(status, OnError );
         }
         break;
+    case _PACK_SELECT_KEY( U16, U16 ):
     case _PACK_SELECT_KEY( I16, I16 ):
     case _PACK_SELECT_KEY( I16, F16 ):
     case _PACK_SELECT_KEY( F16, F16 ):

@@ -838,8 +846,7 @@ static vsi_nn_kernel_node_t _setup
     attr.is_const = FALSE;
     attr.vtl = TRUE;
     attr.size[0] = ((new_shape[0] + 255) / 256) * 4;
-    if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
-        || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16)
+    if (in0_dtype == I16 || in0_dtype == F16 || in0_dtype == U16)
     {
         attr.size[0] = ((new_shape[0] + 127) / 128) * 4;
     }
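
The work_item_pixels change above tracks the launch sizes in the same initializer: 8-bit inputs are covered by (width + 255) / 256 * 16 threads per row at 16 pixels each, while the 16-bit path (now including U16) uses (width + 127) / 128 * 16 threads at 8 pixels each. Worked numbers, chosen here only for illustration:

    /* Worked example of the sums-initializer sizing above;
     * width and height are illustrative values. */
    #include <stdio.h>

    int main(void)
    {
        int width = 512, height = 32;
        int is_input_8bits = 1; /* I8/U8 path */
        int threads_x = is_input_8bits ? (width + 255) / 256 * 16
                                       : (width + 127) / 128 * 16;
        float work_item_pixels = is_input_8bits ? 16.0f * height : 8.0f * height;
        printf("threads_x=%d pixels_per_work_item=%.0f\n", threads_x, work_item_pixels);
        /* 8-bit:  threads_x=32, pixels_per_work_item=512
           16-bit: threads_x=64, pixels_per_work_item=256 */
        return 0;
    }
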
@@ -124,22 +124,23 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
         {0, 0, 0}
         };
     int8_t in0_fl = 0;
-    int32_t inputZP0 = 0;
-    float input_scale0 = 1.0f;
-    int32_t inputZP1 = 0;
-    float input_scale1 = 1.0f;
+    int32_t input0_zp = 0;
+    float input0_scale = 1.0f;
+    int32_t input1_zp = 0;
+    float input1_scale = 1.0f;
+    float output_zp = 0;
     int8_t out_fl = 0;
-    float outputZP = 0;

     int32_t shift0 = 0;
     vsi_bool is_ge_fl = FALSE;

     vsi_bool is_2d_img = FALSE;
     uint32_t evis_version = 0;

     vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
     vsi_size_array_t * out_shape = NULL;
     uint32_t pack_key;
     vx_context ctx = vxGetContext((vx_reference)node);
     vx_hardware_caps_params_t hw_param;

     VSI_UNREFERENCED(param_size);

@@ -165,34 +166,30 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
     CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );

     out_shape = attr[2]->shape;
-    inputZP0 = attr[0]->zero_point;
-    input_scale0 = attr[0]->scale;
-    inputZP1 = attr[1]->zero_point;
-    input_scale1 = attr[1]->scale;
-    outputZP = (float)attr[2]->zero_point;
-    input_scale0 = input_scale0 / attr[2]->scale;
+    input0_zp = attr[0]->zero_point;
+    input0_scale = attr[0]->scale;
+    input1_zp = attr[1]->zero_point;
+    input1_scale = attr[1]->scale;
+    output_zp = (float)attr[2]->zero_point;
+    input0_scale = input0_scale / attr[2]->scale;

-    if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
+    if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP &&
+        attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)
     {
         in0_fl = (int8_t)attr[0]->dfp.fl;
-    }
-
-    if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
-    {
         out_fl = (int8_t)attr[2]->dfp.fl;
+        shift0 = in0_fl - out_fl;
+        is_ge_fl = shift0 >= 0;
     }

-    shift0 = in0_fl - out_fl;
-
     is_2d_img = (out_shape->size < 3) || (out_shape->data[2] == 1);
-    is_ge_fl = shift0 >= 0;

 #define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, GE_FL, IMG_2D, EVIS2 ) \
         (IN0_TYPE | ( OUT_TYPE << 16) | (GE_FL << 24) | (IMG_2D << 25) | (EVIS2 << 26))

-    pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype, is_ge_fl, is_2d_img, evis_version );
+    pack_key = _PACK_SELECT_KEY(attr[0]->dtype, attr[2]->dtype, is_ge_fl, is_2d_img, evis_version);

-    if ( attr[0]->dtype == I8 && attr[2]->dtype == I8 && is_ge_fl)
+    if (attr[0]->dtype == I8 && attr[2]->dtype == I8 && is_ge_fl)
     {
         gpu_param.global_scale[0] = 16;
         gpu_param.global_scale[1] = 1;

@@ -204,7 +201,6 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
         gpu_param.global_scale[1] = 1;
         gpu_param.global_scale[2] = 1;
     }

     gpu_param.global_size[0] = gpu_align_p2(
         (out_shape->data[0] + gpu_param.global_scale[0] - 1)
         / gpu_param.global_scale[0], 4);

@@ -215,97 +211,97 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)

     switch( pack_key )
     {
-    case _PACK_SELECT_KEY( I8, I8, 1, 1, 2 ):
-    case _PACK_SELECT_KEY( I16, I16, 1, 1, 2 ):
-        {
-            gpu_dp_inst_t uniPreluDFPLo_2x8b = {{
-                0x77777777, // TCfg
-                0x44444444, // ASelt
-                0x33221100, 0x77665544, // ABin
-                0x00000000, // BSelt
-                0x30201000, 0x70605040, // BBin
-                0x00004000, // AccumType, ConstantType, and PostShift
-                0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
-            }, GPU_DP_TYPE_16 };
-            gpu_dp_inst_t uniPreluDFPHi_2x8b = {{
-                0x77777777, // TCfg
-                0x44444444, // ASelt
-                0xbbaa9988, 0xffeeddcc, // ABin
-                0x00000000, // BSelt
-                0x30201000, 0x70605040, // BBin
-                0x00004000, // AccumType, ConstantType, and PostShift
-                0x00000000, 0x00000000, 0x00000000, 0x00000000,
-                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
-            }, GPU_DP_TYPE_16 };
-
-            if ( attr[0]->dtype == I16 )
-            {
-                uniPreluDFPLo_2x8b.data[7] = 0x00003000;
-                uniPreluDFPHi_2x8b.data[7] = 0x00003000;
-            }
-
-            gpu_dp_inst_update_postshfit( &uniPreluDFPLo_2x8b, shift0 );
-            gpu_dp_inst_update_postshfit( &uniPreluDFPHi_2x8b, shift0 );
-
-            status = vsi_nn_kernel_gpu_add_param( node,
-                "uniPreluDFPLo_2x8b", &uniPreluDFPLo_2x8b );
-            status |= vsi_nn_kernel_gpu_add_param( node,
-                "uniPreluDFPHi_2x8b", &uniPreluDFPHi_2x8b );
-            CHECK_STATUS_FAIL_GOTO(status, final );
-        }
-        break;
-    case _PACK_SELECT_KEY( I8, I8, 1, 1, 1 ):
-    case _PACK_SELECT_KEY( I16, I16, 1, 1, 1 ):
-        {
-            gpu_dp_inst_t uniPreluInt8_2x8 = {{
-                0x55555555, // TCfg
-                0x00000000, // ASelt
-                0xb3a29180, 0xf7e6d5c4, // ABin
-                0x66666666, // BSelt
-                0x30201000, 0x70605040, // BBin
-                0x00000600, // AccumType, ConstantType, and PostShift
-                0x00000001, 0x00000001, 0x00000001, 0x00000001,
-                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
-            }, GPU_DP_TYPE_16 };
-            gpu_dp_inst_t uniPreluInt16_part0_4x4 = {{
-                0x05050505, // TCfg
-                0x00000000, // ASelt
-                0x00510040, 0x00730062, // ABin
-                0x06060606, // BSelt
-                0x00100000, 0x00300020, // BBin
-                0x00000400, // AccumType, ConstantType, and PostShift
-                0x00000001, 0x00000000, 0x00000001, 0x00000000,
-                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
-            }, GPU_DP_TYPE_16 };
-            gpu_dp_inst_t uniPreluInt16_part1_4x4 = {{
-                0x05050505, // TCfg
-                0x00000000, // ASelt
-                0x00510040, 0x00730062, // ABin
-                0x06060606, // BSelt
-                0x00500040, 0x00700060, // BBin
-                0x00000400, // AccumType, ConstantType, and PostShift
-                0x00000001, 0x00000000, 0x00000001, 0x00000000,
-                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
-            }, GPU_DP_TYPE_16 };
-
-            gpu_dp_inst_update_postshfit( &uniPreluInt8_2x8, shift0 );
-            gpu_dp_inst_update_postshfit( &uniPreluInt16_part0_4x4, shift0 );
-            gpu_dp_inst_update_postshfit( &uniPreluInt16_part1_4x4, shift0 );
-
-            status = vsi_nn_kernel_gpu_add_param( node,
-                "uniPreluInt8_2x8", &uniPreluInt8_2x8 );
-            status |= vsi_nn_kernel_gpu_add_param( node,
-                "uniPreluInt16_part0_4x4", &uniPreluInt16_part0_4x4 );
-            status |= vsi_nn_kernel_gpu_add_param( node,
-                "uniPreluInt16_part1_4x4", &uniPreluInt16_part1_4x4 );
-            CHECK_STATUS_FAIL_GOTO(status, final );
-        }
-        break;
-    case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 1 ):
-    case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 2 ):
-    case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 1 ):
-    case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 2 ):
+    case _PACK_SELECT_KEY(I8, I8, 1, 1, 2):
+    case _PACK_SELECT_KEY(I16, I16, 1, 1, 2):
+        {
+            gpu_dp_inst_t uniPreluDFPLo_2x8b = { {
+                0x77777777, // TCfg
+                0x44444444, // ASelt
+                0x33221100, 0x77665544, // ABin
+                0x00000000, // BSelt
+                0x30201000, 0x70605040, // BBin
+                0x00004000, // AccumType, ConstantType, and PostShift
+                0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+            }, GPU_DP_TYPE_16 };
+            gpu_dp_inst_t uniPreluDFPHi_2x8b = { {
+                0x77777777, // TCfg
+                0x44444444, // ASelt
+                0xbbaa9988, 0xffeeddcc, // ABin
+                0x00000000, // BSelt
+                0x30201000, 0x70605040, // BBin
+                0x00004000, // AccumType, ConstantType, and PostShift
+                0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+            }, GPU_DP_TYPE_16 };
+
+            if (attr[0]->dtype == I16)
+            {
+                uniPreluDFPLo_2x8b.data[7] = 0x00003000;
+                uniPreluDFPHi_2x8b.data[7] = 0x00003000;
+            }
+
+            gpu_dp_inst_update_postshfit(&uniPreluDFPLo_2x8b, shift0);
+            gpu_dp_inst_update_postshfit(&uniPreluDFPHi_2x8b, shift0);
+
+            status = vsi_nn_kernel_gpu_add_param(node,
+                "uniPreluDFPLo_2x8b", &uniPreluDFPLo_2x8b);
+            status |= vsi_nn_kernel_gpu_add_param(node,
+                "uniPreluDFPHi_2x8b", &uniPreluDFPHi_2x8b);
+            CHECK_STATUS_FAIL_GOTO(status, final);
+        }
+        break;
+    case _PACK_SELECT_KEY(I8, I8, 1, 1, 1):
+    case _PACK_SELECT_KEY(I16, I16, 1, 1, 1):
+        {
+            gpu_dp_inst_t uniPreluInt8_2x8 = { {
+                0x55555555, // TCfg
+                0x00000000, // ASelt
+                0xb3a29180, 0xf7e6d5c4, // ABin
+                0x66666666, // BSelt
+                0x30201000, 0x70605040, // BBin
+                0x00000600, // AccumType, ConstantType, and PostShift
+                0x00000001, 0x00000001, 0x00000001, 0x00000001,
+                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+            }, GPU_DP_TYPE_16 };
+            gpu_dp_inst_t uniPreluInt16_part0_4x4 = { {
+                0x05050505, // TCfg
+                0x00000000, // ASelt
+                0x00510040, 0x00730062, // ABin
+                0x06060606, // BSelt
+                0x00100000, 0x00300020, // BBin
+                0x00000400, // AccumType, ConstantType, and PostShift
+                0x00000001, 0x00000000, 0x00000001, 0x00000000,
+                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
+            }, GPU_DP_TYPE_16 };
+            gpu_dp_inst_t uniPreluInt16_part1_4x4 = { {
+                0x05050505, // TCfg
+                0x00000000, // ASelt
+                0x00510040, 0x00730062, // ABin
+                0x06060606, // BSelt
+                0x00500040, 0x00700060, // BBin
+                0x00000400, // AccumType, ConstantType, and PostShift
+                0x00000001, 0x00000000, 0x00000001, 0x00000000,
+                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
+            }, GPU_DP_TYPE_16 };
+
+            gpu_dp_inst_update_postshfit(&uniPreluInt8_2x8, shift0);
+            gpu_dp_inst_update_postshfit(&uniPreluInt16_part0_4x4, shift0);
+            gpu_dp_inst_update_postshfit(&uniPreluInt16_part1_4x4, shift0);
+
+            status = vsi_nn_kernel_gpu_add_param(node,
+                "uniPreluInt8_2x8", &uniPreluInt8_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node,
+                "uniPreluInt16_part0_4x4", &uniPreluInt16_part0_4x4);
+            status |= vsi_nn_kernel_gpu_add_param(node,
+                "uniPreluInt16_part1_4x4", &uniPreluInt16_part1_4x4);
+            CHECK_STATUS_FAIL_GOTO(status, final);
+        }
+        break;
+    case _PACK_SELECT_KEY(BF16, BF16, 0, 1, 1):
+    case _PACK_SELECT_KEY(BF16, BF16, 0, 1, 2):
+    case _PACK_SELECT_KEY(BF16, BF16, 0, 0, 1):
+    case _PACK_SELECT_KEY(BF16, BF16, 0, 0, 2):
     {
         gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
             0x11111111, // TCfg

@@ -446,15 +442,15 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
         status |= vsi_nn_kernel_gpu_add_param( node,
             "uniConvF16toF32_part1_4x4", &uniConvF16toF32_part1_4x4 );
         status |= vsi_nn_kernel_gpu_add_param( node,
-            "inputZP0", &inputZP0 );
+            "input0_zp", &input0_zp);
         status |= vsi_nn_kernel_gpu_add_param( node,
-            "input_scale0", &input_scale0 );
+            "input0_scale", &input0_scale );
         status |= vsi_nn_kernel_gpu_add_param( node,
-            "inputZP1", &inputZP1 );
+            "input1_zp", &input1_zp);
         status |= vsi_nn_kernel_gpu_add_param( node,
-            "input_scale1", &input_scale1 );
+            "input1_scale", &input1_scale );
         status |= vsi_nn_kernel_gpu_add_param( node,
-            "outputZP", &outputZP );
+            "output_zp", &output_zp );
         if (attr[2]->dtype == F16)
         {
             status |= vsi_nn_kernel_gpu_add_param( node,
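
On the merged DFP branch above: shift0 and is_ge_fl are now derived only when both input and output use dynamic fixed point, which is the only case where the post-shift trick applies. For DFP, x_real = x_q * 2^-fl, so requantizing from fraction length fl_in to fl_out is a plain right shift by shift0 = fl_in - fl_out whenever shift0 >= 0, which is what gpu_dp_inst_update_postshfit folds into the DP instruction. A small numeric check with illustrative fraction lengths:

    /* Hedged numeric check of the DFP post-shift relation used above. */
    #include <assert.h>

    int main(void)
    {
        int in_fl = 7, out_fl = 5;       /* illustrative fraction lengths */
        int shift0 = in_fl - out_fl;     /* 2: is_ge_fl, handled by postshift */
        int x_q_in = 192;                /* represents 192 * 2^-7 = 1.5 */
        int x_q_out = x_q_in >> shift0;  /* 48 -> 48 * 2^-5 = 1.5, same value */
        assert(x_q_out == 48);
        return 0;
    }
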
@@ -27,6 +27,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"

@@ -58,53 +59,92 @@ typedef enum
 #define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt"
 #define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_1"
 #define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_2"
+#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_3"
+#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC4(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_4"
+#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC5(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_5"

 #define STR(a) #a
 // Add kernel hashtable here
-#define RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, scale_flag ) \
-        (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (scale_flag))
+#define RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, scale_flag, same_type ) \
+        (( IN_DTYPE ) | ( OUT_DTYPE << 8) | (scale_flag << 16) | (same_type << 22))

-#define PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE ) \
-        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, DOWN ), \
+#define _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, DOWN, SAME_TYPE ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_DOWN"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) }

-#define PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE ) \
-        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP ), \
+#define PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE ) \
+        _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, TRUE ), \
+        _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, FALSE )
+
+#define _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP, SAME_TYPE ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) }

-#define PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE ) \
-        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_OPT ), \
+#define PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE ) \
+        _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, TRUE ), \
+        _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, FALSE )
+
+#define _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_OPT, SAME_TYPE ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP_opt"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(IN_DTYPE) }

-#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \
-        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF ), \
+#define PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE ) \
+        _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, TRUE ), \
+        _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, FALSE )
+
+#define PACK_KERNEL_MAP_UP_2X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF, TRUE ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
          "_SAME_2x_upsample_half_pixel_centers"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }

-#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \
-        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF ), \
+#define PACK_KERNEL_MAP_UP_4X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF, TRUE ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
          "_SAME_4x_upsample_half_pixel_centers"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }

-#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \
-        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF ), \
+#define PACK_KERNEL_MAP_UP_8X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF, TRUE ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
          "_SAME_8x_upsample_half_pixel_centers"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(IN_DTYPE) }

-#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \
-        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF ), \
+#define PACK_KERNEL_MAP_UP_3X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF, TRUE ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
          "_SAME_3x_upsample_half_pixel_centers"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }

+#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF, FALSE ), \
+          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
+          "_2x_upsample_half_pixel_centers"), \
+          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(IN_DTYPE) }
+
+#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF, FALSE ), \
+          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
+          "_4x_upsample_half_pixel_centers"), \
+          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(IN_DTYPE) }
+
+#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF, FALSE ), \
+          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
+          "_8x_upsample_half_pixel_centers"), \
+          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC5(IN_DTYPE) }
+
+#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \
+        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF, FALSE ), \
+          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
+          "_3x_upsample_half_pixel_centers"), \
+          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC4(IN_DTYPE) }
+
 #define PACK_KERNEL_MAP_UP_8X_ALIGN( IN_DTYPE, OUT_DTYPE ) \
|
||||||
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN ), \
|
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN, TRUE ), \
|
||||||
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
|
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
|
||||||
"_SAME_8x_upsample_align_corners"), \
|
"_SAME_8x_upsample_align_corners"), \
|
||||||
"resize_bilinear_align_corners" }
|
"resize_bilinear_align_corners" }
|
||||||
|
|
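The hash key now carries a fourth field, so a quantization-preserving ("same type") and a requantizing variant of the same scale flag can coexist in the kernel map; the PACK_KERNEL_MAP_* wrappers simply register both. A minimal standalone sketch of the packing, with hypothetical enum values (the real dtype ids and scale flags live in ovxlib):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical ids for illustration only. */
enum { DT_U8 = 5, FLAG_UP_2X_HALF = 6 };

static uint32_t resize_bilinear_hash_key(uint32_t in_dtype, uint32_t out_dtype,
                                         uint32_t scale_flag, uint32_t same_type)
{
    /* dtypes in bits 0..7 and 8..15, scale flag in bits 16..21, same_type in bit 22 */
    return in_dtype | (out_dtype << 8) | (scale_flag << 16) | (same_type << 22);
}

int main(void)
{
    /* The same dtype/flag pair yields two distinct keys, so the map can hold
     * a same-quantization kernel and a requantizing kernel side by side. */
    printf("same_type=1 -> 0x%08x\n", resize_bilinear_hash_key(DT_U8, DT_U8, FLAG_UP_2X_HALF, 1));
    printf("same_type=0 -> 0x%08x\n", resize_bilinear_hash_key(DT_U8, DT_U8, FLAG_UP_2X_HALF, 0));
    return 0;
}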
@ -135,6 +175,10 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] =
    PACK_KERNEL_MAP_UP(F16, F16),
    PACK_KERNEL_MAP_UP(BF16, BF16),
    PACK_KERNEL_MAP_UP_OPT(U8, U8),
    PACK_KERNEL_MAP_UP_2X_HALF_SAME_TYPE(U8, U8),
    PACK_KERNEL_MAP_UP_3X_HALF_SAME_TYPE(U8, U8),
    PACK_KERNEL_MAP_UP_4X_HALF_SAME_TYPE(U8, U8),
    PACK_KERNEL_MAP_UP_8X_HALF_SAME_TYPE(U8, U8),
    PACK_KERNEL_MAP_UP_2X_HALF(U8, U8),
    PACK_KERNEL_MAP_UP_3X_HALF(U8, U8),
    PACK_KERNEL_MAP_UP_4X_HALF(U8, U8),
@ -672,18 +716,23 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
    };
    vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
    vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
    vsi_size_array_t * out_shape = NULL;
    vsi_size_array_t * in_shape = NULL;
    vsi_nn_kernel_dtype_e input_dtype = F16;
    vsi_nn_kernel_dtype_e output_dtype = F16;
    uint32_t depth = 0;
    uint32_t in_width = 0;
    uint32_t in_height = 0;
    uint32_t out_width = 0;
    uint32_t out_height = 0;
    vsi_bool is_same_type = FALSE;
    vsi_bool is_2x_up_kernel = FALSE;
    vsi_bool is_3x_up_kernel = FALSE;
    vsi_bool is_4x_up_kernel = FALSE;
    vsi_bool is_8x_up_kernel = FALSE;
    float scale = 1.f;
    int32_t input_zp = 0;
    int32_t output_zp = 0;

    VSI_UNREFERENCED(param_size);
@ -692,17 +741,23 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
    output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );

    out_shape = output_attr->shape;
    in_shape = input_attr->shape;
    input_dtype = input_attr->dtype;
    output_dtype = output_attr->dtype;

    in_width = (uint32_t)(in_shape->data[0]);
    in_height = (uint32_t)(in_shape->data[1]);
    depth = (uint32_t)(in_shape->data[2]);
    out_width = (uint32_t)(out_shape->data[0]);
    out_height = (uint32_t)(out_shape->data[1]);
    scale = input_attr->scale;
    input_zp = input_attr->zero_point;
    scale /= output_attr->scale;
    output_zp = output_attr->zero_point;
    is_same_type = _is_same_quant(input_attr, output_attr);

    if ((U8 == input_dtype) && (output_dtype == U8))
    {
        is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
        is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
@ -728,206 +783,303 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
        gpu_param.global_scale[2] = 1;
    }

    if (is_2x_up_kernel || is_3x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
    {
        uint16_t M0 = 0;
        int32_t postShift = 0;
        uint32_t multAndoutZP[2] = { 0 };
        gpu_dp_inst_t uniU8PostProcess_2x8 = { {
            0xdddddddd, // TCfg
            0x44444444, // ASelt
            0x13121110, 0x17161514, // ABin
            0x11111111, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002600, // AccumType, ConstantType, and PostShift
            0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };

        if (is_2x_up_kernel)
        {
            gpu_dp_inst_t uniResize2xUp_0_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect
                0x00000704, // AccumType, ConstantType, and PostShift
                0x09030301, 0x03090103, 0x09030301, 0x03090103,
                0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize2xUp_1_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect
                0x00000704, // AccumType, ConstantType, and PostShift
                0x09030301, 0x03090103, 0x09030301, 0x03090103,
                0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
            }, GPU_DP_TYPE_16 };

            if (!is_same_type)
            {
                float f2i_radio = 16.0f;
                gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
                multAndoutZP[0] = (uint32_t)(M0);
                multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);

                gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
                uniResize2xUp_0_4x8.data[7] = 0x00000700;
                uniResize2xUp_1_4x8.data[7] = 0x00000700;

                status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
                        &uniU8PostProcess_2x8);
                status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
                CHECK_STATUS_FAIL_GOTO(status, final);
            }

            status = vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
            CHECK_STATUS_FAIL_GOTO(status, final);
        }
        else if (is_3x_up_kernel)
        {
            gpu_dp_inst_t uniResize3xUp_l00_2x8 = { {
                0x15515515, // TCfg
                0x00000000, // ASelt
                0x21210110, 0x03323202, // ABin
                0x2aa2aa2a, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000610, // AccumType, ConstantType, and PostShift
                0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555,
                0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize3xUp_l01_2x8 = { {
                0x05155155, // TCfg
                0x00000000, // ASelt
                0x54044343, 0x00650554, // ABin
                0x0a2aa2aa, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000610, // AccumType, ConstantType, and PostShift
                0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa,
                0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize3xUp_l10_4x4 = { {
                0x55551155, // TCfg
                0x50501050, // ASelt
                0x01011010, 0x21212121, // ABin
                0xaaaa22aa, // BSelt
                0x00000000, 0x00000000, // BBin
                0x0000060f, // AccumType, ConstantType, and PostShift
                0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
                0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize3xUp_l11_4x4 = { {
                0x11555511, // TCfg
                0x10505010, // ASelt
                0x32320202, 0x03033232, // ABin
                0x22aaaa22, // BSelt
                0x00000000, 0x00000000, // BBin
                0x0000060f, // AccumType, ConstantType, and PostShift
                0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72,
                0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize3xUp_l12_4x4 = { {
                0x55115555, // TCfg
                0x50105050, // ASelt
                0x43434343, 0x54540404, // ABin
                0xaa22aaaa, // BSelt
                0x00000000, 0x00000000, // BBin
                0x0000060f, // AccumType, ConstantType, and PostShift
                0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39,
                0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize3xUp_l13_4x4 = { {
                0x00551155, // TCfg
                0x00501050, // ASelt
                0x05055454, 0x00006565, // ABin
                0x00aa22aa, // BSelt
                0x00000000, 0x00000000, // BBin
                0x0000060f, // AccumType, ConstantType, and PostShift
                0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
                0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };

            if (!is_same_type)
            {
                float f2i_radio = 256.0f;
                gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
                multAndoutZP[0] = (uint32_t)(M0);
                multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);

                gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
                uniResize3xUp_l00_2x8.data[7] = 0x00000608;
                uniResize3xUp_l01_2x8.data[7] = 0x00000608;
                uniResize3xUp_l10_4x4.data[7] = 0x00000607;
                uniResize3xUp_l11_4x4.data[7] = 0x00000607;
                uniResize3xUp_l12_4x4.data[7] = 0x00000607;
                uniResize3xUp_l13_4x4.data[7] = 0x00000607;

                status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
                        &uniU8PostProcess_2x8);
                status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
                CHECK_STATUS_FAIL_GOTO(status, final);
            }

            status = vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4);
            CHECK_STATUS_FAIL_GOTO(status, final);
        }
        else if (is_4x_up_kernel)
        {
            gpu_dp_inst_t uniResize4xUp_l00_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
                0x00000406, // AccumType, ConstantType, and PostShift
                0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
                0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize4xUp_l01_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
                0x00000406, // AccumType, ConstantType, and PostShift
                0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
                0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize4xUp_l10_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
                0x00000406, // AccumType, ConstantType, and PostShift
                0x23150503, 0x31070701, 0x07310107, 0x15230305,
                0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize4xUp_l11_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
                0x00000406, // AccumType, ConstantType, and PostShift
                0x23150503, 0x31070701, 0x07310107, 0x15230305,
                0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
            }, GPU_DP_TYPE_16 };

            if (!is_same_type)
            {
                float f2i_radio = 64.0f;
                gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
                multAndoutZP[0] = (uint32_t)(M0);
                multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);

                gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
                uniResize4xUp_l00_4x8.data[7] = 0x00000400;
                uniResize4xUp_l01_4x8.data[7] = 0x00000400;
                uniResize4xUp_l10_4x8.data[7] = 0x00000400;
                uniResize4xUp_l11_4x8.data[7] = 0x00000400;

                status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
                        &uniU8PostProcess_2x8);
                status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
                CHECK_STATUS_FAIL_GOTO(status, final);
            }

            status = vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
            CHECK_STATUS_FAIL_GOTO(status, final);
        }
        else if (is_8x_up_kernel)
        {
            gpu_dp_inst_t uniResize8xUp_l00_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
                0x00000708, // AccumType, ConstantType, and PostShift
                0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
                0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize8xUp_l01_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
                0x00000708, // AccumType, ConstantType, and PostShift
                0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
                0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize8xUp_l10_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
                0x00000708, // AccumType, ConstantType, and PostShift
                0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
                0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize8xUp_l11_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
                0x00000708, // AccumType, ConstantType, and PostShift
                0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
                0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize8xUp_l20_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
                0x00000708, // AccumType, ConstantType, and PostShift
                0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
                0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize8xUp_l21_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
                0x00000708, // AccumType, ConstantType, and PostShift
                0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
                0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize8xUp_l30_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
                0x00000708, // AccumType, ConstantType, and PostShift
                0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
                0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniResize8xUp_l31_4x8 = { {
                0x55555555, 0x55555555, // TCfg
                0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
                0x00000708, // AccumType, ConstantType, and PostShift
                0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
                0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
            }, GPU_DP_TYPE_16 };

            if (!is_same_type)
            {
                float f2i_radio = 256.0f;
                gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
                multAndoutZP[0] = (uint32_t)(M0);
                multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);

                gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
                uniResize8xUp_l00_4x8.data[7] = 0x00000700;
                uniResize8xUp_l01_4x8.data[7] = 0x00000700;
                uniResize8xUp_l10_4x8.data[7] = 0x00000700;
                uniResize8xUp_l11_4x8.data[7] = 0x00000700;
                uniResize8xUp_l20_4x8.data[7] = 0x00000700;
                uniResize8xUp_l21_4x8.data[7] = 0x00000700;
                uniResize8xUp_l30_4x8.data[7] = 0x00000700;
                uniResize8xUp_l31_4x8.data[7] = 0x00000700;

                status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
                        &uniU8PostProcess_2x8);
                status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
                CHECK_STATUS_FAIL_GOTO(status, final);
            }

            status = vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
            CHECK_STATUS_FAIL_GOTO(status, final);
        }
    }
    else
    {
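When input and output quantization differ (!is_same_type), the initializer above folds scale = input_scale / output_scale into a 16-bit multiply plus right shift and hands the shader a fused multiplier/zero-point pair. A standalone sketch of that arithmetic; quantize_multiplier_16bit() below is a stand-in for ovxlib's gpu_quantize_multiplier_16bit(), whose exact rounding is an assumption:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Pick a 16-bit multiplier M0 and a right shift so M0 / 2^postShift ~= s. */
static void quantize_multiplier_16bit(double s, uint16_t *M0, int32_t *postShift)
{
    int exp = 0;
    double frac = frexp(s, &exp);          /* s == frac * 2^exp, frac in [0.5, 1) */
    *M0 = (uint16_t)lround(frac * (double)(1 << 15));
    *postShift = 15 - exp;
}

/* One requantized value: y = (x - in_zp) * s + out_zp in integer form. The
 * multAndoutZP pair in the code above folds the two zero-point terms into a
 * single constant added before the final shift. */
static int32_t requantize(int32_t x, int32_t in_zp, int32_t out_zp,
                          uint16_t M0, int32_t postShift)
{
    int64_t acc = (int64_t)(x - in_zp) * M0;
    return (int32_t)(acc >> postShift) + out_zp;
}

int main(void)
{
    uint16_t M0 = 0;
    int32_t postShift = 0;
    quantize_multiplier_16bit(0.5 / 16.0, &M0, &postShift);  /* scale / f2i_radio */
    printf("M0=%u postShift=%d y=%d\n", (unsigned)M0, (int)postShift,
           (int)requantize(200, 128, 0, M0, postShift));
    return 0;
}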
@ -1193,22 +1345,22 @@ static vsi_status _query_kernel

    if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0])
    {
        if ((!align_corners) && (half_pixel_centers) && is_2x_upsample)
        {
            scale_flag = UP_2X_HALF;
            initializer = _bilinear_half_pixel_centers_opt_initializer;
        }
        else if ((!align_corners) && (half_pixel_centers) && is_3x_upsample)
        {
            scale_flag = UP_3X_HALF;
            initializer = _bilinear_half_pixel_centers_opt_initializer;
        }
        else if ((!align_corners) && (half_pixel_centers) && is_4x_upsample)
        {
            scale_flag = UP_4X_HALF;
            initializer = _bilinear_half_pixel_centers_opt_initializer;
        }
        else if ((!align_corners) && (half_pixel_centers) && is_8x_upsample)
        {
            scale_flag = UP_8X_HALF;
            initializer = _bilinear_half_pixel_centers_opt_initializer;
@ -1232,7 +1384,7 @@ static vsi_status _query_kernel
            scale_flag = DOWN;
        }

    key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
    for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if( kernel_map[i].key == key )
@ -1244,7 +1396,7 @@ static vsi_status _query_kernel
    if ((UP_OPT == scale_flag) && (i >= kernel_map_size))
    {
        scale_flag = UP;
        key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
        for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
        {
            if( kernel_map[i].key == key )
@ -1257,7 +1409,7 @@ static vsi_status _query_kernel
    if ((UP == scale_flag) && (i >= kernel_map_size))
    {
        scale_flag = DOWN;
        key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
        for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
        {
            if( kernel_map[i].key == key )
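The hunks above stage the lookup: a miss with the UP_OPT key (the scan runs off the end of the table) retries with UP, then DOWN, each time repacking the key with the same is_same_type bit. A minimal sketch of that scan-and-retry pattern, with hypothetical keys:

#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t key; const char *name; } kmap_entry_t;

/* Linear scan as in the loops above; index == count means "not found". */
static uint32_t find_kernel(const kmap_entry_t *map, uint32_t n, uint32_t key)
{
    uint32_t i;
    for (i = 0; i < n; i++)
    {
        if (map[i].key == key) break;
    }
    return i;
}

int main(void)
{
    /* Hypothetical keys standing in for the UP_OPT / UP variants of one dtype. */
    kmap_entry_t map[] = { { 0x2u, "resize_bilinear_up" } };
    uint32_t n = 1;
    uint32_t i = find_kernel(map, n, 0x3u /* UP_OPT key */);
    if (i >= n)
    {
        i = find_kernel(map, n, 0x2u /* fall back to the plain UP key */);
    }
    printf("%s\n", map[i].name);
    return 0;
}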
@ -1433,7 +1585,7 @@ static vsi_bool _is_image_width_lt16
    size_t bytes = vsi_nn_kernel_dtype_get_bytes(in_dtype);
    vsi_size_t max_cross_read_img_width = bytes == 1 ? 16 : 8;

    if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
    {
        return FALSE;
    }
@ -1468,7 +1620,8 @@ static vsi_nn_kernel_node_t _setup
    int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
    int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
    vsi_bool is_same_type = vsi_nn_is_same_type(inputs[0], outputs[0]);
    vsi_bool is_evis2 = \
        (vsi_bool)(((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver == VSI_NN_HW_EVIS_2);
    vsi_bool is_run_opt_kernel = FALSE;
    vsi_nn_tensor_t* scale = NULL;
    int32_t pad_left = half_pixel_centers ? 1 : 0;
@ -0,0 +1,744 @@
/****************************************************************************
*
*    Copyright (c) 2020 Vivante Corporation
*
*    Permission is hereby granted, free of charge, to any person obtaining a
*    copy of this software and associated documentation files (the "Software"),
*    to deal in the Software without restriction, including without limitation
*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
*    and/or sell copies of the Software, and to permit persons to whom the
*    Software is furnished to do so, subject to the following conditions:
*
*    The above copyright notice and this permission notice shall be included in
*    all copies or substantial portions of the Software.
*
*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
*    DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 B---batch
 N---num_heads
 S---sequence length
 H---head size
 */
typedef enum
{
    LAYOUT_NONE,
    LAYOUT_BNHS,
    LAYOUT_BNH1,
    LAYOUT_BSNH,
    LAYOUT_BNSH,
} _internal_rope_layout_e;

// Add kernel hashtable here
#define STR(a) #a
#define ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT, INTERLEAVED ) \
        ((IN0_DTYPE) | (IN1_DTYPE << 8) | (OUT_DTYPE << 16) | (LAYOUT << 24) | (INTERLEAVED << 28))
#define PACK_KERNEL_BNHS_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
        { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNHS, 0 ), \
        CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnhs"), \
        "rope_0" }
#define PACK_KERNEL_BNH1_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
        { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNH1, 0 ), \
        CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnh1"), \
        "rope_1" }

#define PACK_KERNEL_BSNH_INTERLEVEAD_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
        { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BSNH, 1 ), \
        CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bsnh"), \
        "rope_2" }

#define PACK_KERNEL_BNSH_INTERLEVEAD_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
        { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNSH, 1 ), \
        CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnsh"), \
        "rope_3" }

typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

#define PACK_KERNEL_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
        PACK_KERNEL_BNHS_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
        PACK_KERNEL_BNH1_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
        PACK_KERNEL_BSNH_INTERLEVEAD_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
        PACK_KERNEL_BNSH_INTERLEVEAD_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE),
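/* ROPE_HASH_KEY packs the three dtypes into bits 0..23, the layout id into
 * bits 24..27 and the interleaved flag into bits 28..31; PACK_KERNEL_MAP
 * registers all four layout variants for one dtype triple at once, so e.g.
 * a (U8, F16, U8) interleaved BSNH query resolves to the "..._bsnh" kernel
 * built from source "rope_2". */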
static const _kernel_map_type _rope_kernel_map[] =
{
    // Register kernel here
    PACK_KERNEL_MAP( BF16, BF16, BF16)
    PACK_KERNEL_MAP( F16, F16, F16 )
    PACK_KERNEL_MAP( I16, I16, I16 )
    PACK_KERNEL_MAP( I16, F16, I16 )
    PACK_KERNEL_MAP( I16, I16, I8 )
    PACK_KERNEL_MAP( I16, F16, I8 )
    PACK_KERNEL_MAP( I16, I16, U8 )
    PACK_KERNEL_MAP( I16, F16, U8 )
    PACK_KERNEL_MAP( U16, U16, U16 )
    PACK_KERNEL_MAP( U16, F16, U16 )
    PACK_KERNEL_MAP( I8, I8, I8 )
    PACK_KERNEL_MAP( I8, F16, I8 )
    PACK_KERNEL_MAP( U8, U8, U8 )
    PACK_KERNEL_MAP( U8, F16, U8 )
};

/*
 * Kernel params
 */
static vx_param_description_t _rope_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROPE_PARAM_NUM  _cnt_of_array( _rope_kernel_param_def )
#define SCALAR_AXIS (4)
/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_rope_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };
    vsi_nn_kernel_tensor_attr_t* out_attr = NULL;
    vsi_nn_kernel_tensor_attr_t* in0_attr = NULL;
    vsi_nn_kernel_tensor_attr_t* in1_attr = NULL;
    vsi_nn_kernel_tensor_attr_t* in2_attr = NULL;
    vsi_size_array_t* in_shape = NULL;
    vsi_nn_kernel_dtype_e in0_dtype = F16;
    vsi_nn_kernel_dtype_e in1_dtype = F16;
    vsi_nn_kernel_dtype_e in2_dtype = F16;
    vsi_nn_kernel_dtype_e out_dtype = F16;
    float in0_scale = 1.0f;
    float in1_scale = 1.0f;
    float in2_scale = 1.0f;
    float output_scale = 1.0f;
    float output_zp = 0;
    int32_t in0_zp = 0;
    int32_t cos_zp = 0;
    int32_t sin_zp = 0;
    int32_t p = 0;
    int32_t axis = 0;
    int32_t interleaved = 0;
    int32_t half_head_size = 1;
    vsi_size_t shape[3] = {1};
    uint32_t pack_key = 0;

    VSI_UNREFERENCED(node);
    VSI_UNREFERENCED(param);
    VSI_UNREFERENCED(param_size);
    // Add initializer

    in0_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]);
    CHECK_PTR_FAIL_GOTO(in0_attr, "Create tensor attr buffer fail.", final);
    in1_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]);
    CHECK_PTR_FAIL_GOTO(in1_attr, "Create tensor attr buffer fail.", final);
    in2_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]);
    CHECK_PTR_FAIL_GOTO(in2_attr, "Create tensor attr buffer fail.", final);
    out_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[3]);
    CHECK_PTR_FAIL_GOTO(out_attr, "Create tensor attr buffer fail.", final);

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &p);
    CHECK_STATUS_FAIL_GOTO(status, final);

    axis = p & 0xFFFF;
    interleaved = (p >> 16) & 0xFFFF;
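    /* The scalar at SCALAR_AXIS packs two fields: the rotation axis in the
     * low 16 bits and the interleaved flag in the high 16 bits, i.e. the
     * caller passes p = (interleaved << 16) | axis, as the two extractions
     * above imply. */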
    in_shape = in0_attr->shape;
    in0_dtype = in0_attr->dtype;
    in1_dtype = in1_attr->dtype;
    in2_dtype = in2_attr->dtype;
    out_dtype = out_attr->dtype;

    in0_scale = in0_attr->scale;
    in1_scale = in1_attr->scale;
    in2_scale = in2_attr->scale;
    in0_zp = -in0_attr->zero_point;
    cos_zp = -in1_attr->zero_point;
    sin_zp = -in2_attr->zero_point;
    output_scale = out_attr->scale;
    output_zp = (float)out_attr->zero_point;

    half_head_size = (int32_t)(in_shape->data[axis] / 2);
    shape[0] = in_shape->data[0];
    shape[1] = in_shape->data[1];
    shape[2] = in_shape->data[2];
    shape[axis] = half_head_size;

    gpu_param.global_scale[0] = 8;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;
    gpu_param.global_size[0] = gpu_align_p2((shape[0] + \
            gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
    gpu_param.global_size[1] = shape[1];
    gpu_param.global_size[2] = shape[2];

#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \
        ((IN0_TYPE) | (IN1_TYPE << 8) | (IN2_TYPE << 16) | (OUT_TYPE << 24))

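    /* _PACK_SELECT_KEY collapses the four dtypes into one word so the switch
     * below can branch on an exact (in0, cos, sin, out) combination, e.g.
     * _PACK_SELECT_KEY(U8, F16, F16, U8) for u8 data with f16 cos/sin tables. */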
    pack_key = _PACK_SELECT_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype);
    switch (pack_key)
    {
    case _PACK_SELECT_KEY(BF16, BF16, BF16, BF16):
        {
            gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = { {
                0x11111111, // TCfg
                0x01010101, // ASelt
                0x01050004, 0x03070206, // ABin
                0x22222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000600, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000001, 0x00000001, 0x00000001,
                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = { {
                0x11111111, // TCfg
                0x01010101, // ASelt
                0x05050404, 0x07070606, // ABin
                0x22222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000600, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000001, 0x00000001, 0x00000001,
                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniExtractOddData_2x8 = { {
                0x11111111, // TCfg
                0x11110000, // ASelt
                0x07050301, 0x07050301, // ABin
                0x22222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000600, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000001, 0x00000001, 0x00000001,
                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
            }, GPU_DP_TYPE_16 };

            if (interleaved && axis == 0)
            {
                uniExtractOddData_2x8.data[1] = 0x10101010;
                uniExtractOddData_2x8.data[2] = 0x03030101;
                uniExtractOddData_2x8.data[3] = 0x07070505;
            }
            else
            {
                status = vsi_nn_kernel_gpu_add_param(node,
                        "half_head_size", &half_head_size);
                CHECK_STATUS_FAIL_GOTO(status, final);
            }
            status = vsi_nn_kernel_gpu_add_param(node,
                    "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "uniExtractOddData_2x8", &uniExtractOddData_2x8);
            CHECK_STATUS_FAIL_GOTO(status, final);
        }
        break;
    case _PACK_SELECT_KEY(I16, I16, I16, I16):
    case _PACK_SELECT_KEY(I16, F16, F16, I16):
    case _PACK_SELECT_KEY(I16, I16, I16, I8):
    case _PACK_SELECT_KEY(I16, F16, F16, I8):
    case _PACK_SELECT_KEY(I16, I16, I16, U8):
    case _PACK_SELECT_KEY(I16, F16, F16, U8):
    case _PACK_SELECT_KEY(F16, F16, F16, F16):
        {
            float scale0 = in0_scale * in1_scale / output_scale;
            float scale1 = in0_scale * in2_scale / output_scale;
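            /* Folded requantization factors: the kernel evaluates
             * y = x*cos*scale0 +/- rotate_half(x)*sin*scale1, so each product
             * of two input scales is normalized once by the output scale. */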
            gpu_dp_inst_t uniExtractHalf8_2x8 = { {
                0x11111111, // TCfg
                0x11110000, // ASelt
                0x06040200, 0x06040200, // ABin
                0x22222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000100, // AccumType, ConstantType, and PostShift
                0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
                0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniExtractInteger_2x8 = { {
                0x33333333, // TCfg
                0x11110000, // ASelt
                0x03020100, 0x03020100, // ABin
                0x00000000, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00002400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniATimesB_0_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00010000, 0x00030002, // ABin
                0x01010101, // BSelt
                0x00010000, 0x00030002, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniATimesB_1_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00050004, 0x00070006, // ABin
                0x01010101, // BSelt
                0x00050004, 0x00070006, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAEvenTimesB_0_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00020000, 0x00060004, // ABin
                0x01010101, // BSelt
                0x00010000, 0x00030002, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAEvenTimesB_1_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00020000, 0x00060004, // ABin
                0x01010101, // BSelt
                0x00050004, 0x00070006, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAOddTimesB_0_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00030001, 0x00070005, // ABin
                0x01010101, // BSelt
                0x00010000, 0x00030002, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAOddTimesB_1_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00030001, 0x00070005, // ABin
                0x01010101, // BSelt
                0x00050004, 0x00070006, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };

            if (interleaved && axis == 0)
            {
                uniExtractHalf8_2x8.data[1] = 0x10101010;
                uniExtractHalf8_2x8.data[2] = 0x02020000;
                uniExtractHalf8_2x8.data[3] = 0x06060404;
                uniExtractInteger_2x8.data[1] = 0x10101010;
                uniExtractInteger_2x8.data[2] = 0x01010000;
                uniExtractInteger_2x8.data[3] = 0x03030202;

                status = vsi_nn_kernel_gpu_add_param(node,
                        "uniAEvenTimesB_0_4x4", &uniAEvenTimesB_0_4x4);
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniAEvenTimesB_1_4x4", &uniAEvenTimesB_1_4x4);
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniAOddTimesB_0_4x4", &uniAOddTimesB_0_4x4);
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniAOddTimesB_1_4x4", &uniAOddTimesB_1_4x4);
            }
            else
            {
                status = vsi_nn_kernel_gpu_add_param(node,
                        "uniATimesB_0_4x4", &uniATimesB_0_4x4);
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniATimesB_1_4x4", &uniATimesB_1_4x4);
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "half_head_size", &half_head_size);
            }
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "scale0", &scale0);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "scale1", &scale1);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "output_zp", &output_zp);
            if (out_dtype == F16)
            {
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniExtract8Data_2x8", &uniExtractHalf8_2x8);
            }
            else
            {
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniExtract8Data_2x8", &uniExtractInteger_2x8);
            }
            CHECK_STATUS_FAIL_GOTO(status, final);
        }
        break;
    case _PACK_SELECT_KEY(I8, I8, I8, I8):
    case _PACK_SELECT_KEY(U8, U8, U8, U8):
    case _PACK_SELECT_KEY(U16, U16, U16, U16):
    case _PACK_SELECT_KEY(I8, F16, F16, I8):
    case _PACK_SELECT_KEY(U8, F16, F16, U8):
    case _PACK_SELECT_KEY(U16, F16, F16, U16):
        {
            float scale0 = in0_scale * in1_scale / output_scale;
            float scale1 = in0_scale * in2_scale / output_scale;
            gpu_dp_inst_t uniExtractInteger_2x8 = { {
                0x33333333, // TCfg
                0x11110000, // ASelt
                0x03020100, 0x03020100, // ABin
                0x00000000, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00002400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAMinusZp_0_4x4 = { {
                0x0d0d0d0d, // TCfg
                0x04040404, // ASelt
                0x00010000, 0x00030002, // ABin
                0x02020202, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00002400, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000000, 0x00000001, 0x00000000,
                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAMinusZp_1_4x4 = { {
                0x0d0d0d0d, // TCfg
                0x04040404, // ASelt
                0x00050004, 0x00070006, // ABin
                0x02020202, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00002400, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000000, 0x00000001, 0x00000000,
                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAEvenMinusZp_4x4 = { {
                0x0d0d0d0d, // TCfg
                0x04040404, // ASelt
                0x00020000, 0x00060004, // ABin
                0x02020202, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00002400, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000000, 0x00000001, 0x00000000,
                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
|
gpu_dp_inst_t uniAOddMinusZp_4x4 = { {
|
||||||
|
0x0d0d0d0d, // TCfg
|
||||||
|
0x04040404, // ASelt
|
||||||
|
0x00030001, 0x00070005, // ABin
|
||||||
|
0x02020202, // BSelt
|
||||||
|
0x00000000, 0x00000000, // BBin
|
||||||
|
0x00002400, // AccumType, ConstantType, and PostShift
|
||||||
|
0x00000001, 0x00000000, 0x00000001, 0x00000000,
|
||||||
|
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
|
||||||
|
}, GPU_DP_TYPE_16 };
|
||||||
|
|
||||||
|
if (interleaved && axis == 0)
|
||||||
|
{
|
||||||
|
uniExtractInteger_2x8.data[1] = 0x10101010;
|
||||||
|
uniExtractInteger_2x8.data[2] = 0x01010000;
|
||||||
|
uniExtractInteger_2x8.data[3] = 0x03030202;
|
||||||
|
|
||||||
|
status = vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"uniAEvenMinusZp_4x4", &uniAEvenMinusZp_4x4);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"uniAOddMinusZp_4x4", &uniAOddMinusZp_4x4);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
status = vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"half_head_size", &half_head_size);
|
||||||
|
}
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"uniAMinusZp_0_4x4", &uniAMinusZp_0_4x4);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"uniAMinusZp_1_4x4", &uniAMinusZp_1_4x4);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"scale0", &scale0);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"scale1", &scale1);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"output_zp", &output_zp);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"in0_zp", &in0_zp);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"cos_zp", &cos_zp);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"sin_zp", &sin_zp);
|
||||||
|
status |= vsi_nn_kernel_gpu_add_param(node,
|
||||||
|
"uniExtract8Data_2x8", &uniExtractInteger_2x8);
|
||||||
|
CHECK_STATUS_FAIL_GOTO(status, final);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
status = vsi_nn_kernel_gpu_config(node, &gpu_param);
|
||||||
|
final:
|
||||||
|
if (in0_attr) vsi_nn_kernel_tensor_attr_release(&in0_attr);
|
||||||
|
if (in1_attr) vsi_nn_kernel_tensor_attr_release(&in1_attr);
|
||||||
|
if (in2_attr) vsi_nn_kernel_tensor_attr_release(&in2_attr);
|
||||||
|
if (out_attr) vsi_nn_kernel_tensor_attr_release(&out_attr);
|
||||||
|
return status;
|
||||||
|
} /* _rope_initializer() */
|
||||||
|
|
||||||
|
/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
    int32_t axis,
    int32_t interleaved,
    _internal_rope_layout_e *layout
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in0_dtype;
    vsi_nn_kernel_dtype_e in1_dtype;
    vsi_nn_kernel_dtype_e in2_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    int32_t in0_zp = vsi_nn_get_tensor_zero_point(inputs[0]);
    int32_t in1_zp = vsi_nn_get_tensor_zero_point(inputs[1]);
    int32_t in2_zp = vsi_nn_get_tensor_zero_point(inputs[2]);
    const _kernel_map_type * kernel_map = _rope_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _rope_kernel_map );
    vx_param_description_t * param_def = _rope_kernel_param_def;
    vx_kernel_initialize_f initializer = _rope_initializer;

    uint32_t key;
    uint32_t i;

    in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
    in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    /* Only symmetric int16 is supported: reject I16 combinations with a nonzero zero point. */
    if ( ( (in0_dtype == I16 && in1_dtype == I16 && out_dtype == I16) ||
           (in0_dtype == I16 && in1_dtype == F16 && out_dtype == I16) ||
           (in0_dtype == I16 && in1_dtype == F16 && out_dtype == I8) ||
           (in0_dtype == I16 && in1_dtype == I16 && out_dtype == I8) ||
           (in0_dtype == I16 && in1_dtype == F16 && out_dtype == U8) ||
           (in0_dtype == I16 && in1_dtype == I16 && out_dtype == U8) ) &&
         (in0_zp != 0 || in1_zp != 0 || in2_zp != 0))
    {
        return VSI_FAILURE;
    }

    if (axis == 1 && inputs[0]->attr.size[0] == inputs[1]->attr.size[0] &&
        in1_dtype == in2_dtype)
    {
        if (inputs[0]->attr.size[0] == 1)
        {
            *layout = LAYOUT_BNH1;
        }
        else
        {
            *layout = LAYOUT_BNHS;
        }
    }
    else if (axis == 0 && in1_dtype == in2_dtype)
    {
        if (inputs[0]->attr.size[2] == inputs[1]->attr.size[2] &&
            inputs[1]->attr.size[1] == 1)
        {
            *layout = LAYOUT_BSNH;
        }
        else
        {
            *layout = LAYOUT_BNSH;
        }
    }

    key = ROPE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, *layout, interleaved);

    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < (uint32_t)kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = _cnt_of_array( _rope_kernel_param_def );
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
                "vsi_nn_kernel_header",
                kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_ROPE_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_t node = NULL;
    int32_t axis = 0;
    int32_t i = 0;
    int32_t interleaved = 0;
    int32_t param = 0;
    vsi_size_t shape[3][VSI_NN_MAX_DIM_NUM] = { 0 };
    vsi_nn_tensor_t* rs_tensors[4] = { NULL };
    vsi_nn_tensor_t* reshape_tensors[4] = { NULL };
    _internal_rope_layout_e layout = LAYOUT_NONE;

    VSI_UNREFERENCED(params);

    axis = vsi_nn_kernel_param_get_int32(params, "axis");
    interleaved = vsi_nn_kernel_param_get_int32(params, "interleaved");

    // Check if gpu can support the size
    if ( !vsi_nn_kernel_gpu_check_shape(
        inputs[0]->attr.size, inputs[0]->attr.dim_num ) )
    {
        return NULL;
    }

    status = _query_kernel( kernel, inputs, outputs, axis, interleaved, &layout );
    if (outputs[0]->attr.size[0] == 1 || layout == LAYOUT_BSNH)
    {
        memcpy(shape[0], inputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
        memcpy(shape[1], inputs[1]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
        memcpy(shape[2], outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));

        if (outputs[0]->attr.size[0] == 1)
        {
            for (i = 1; i < 3; i++)
            {
                shape[0][i - 1] = shape[0][i];
                shape[1][i - 1] = shape[1][i];
                shape[2][i - 1] = shape[2][i];
            }
            shape[0][2] = 1;
            shape[1][2] = 1;
            shape[2][2] = 1;
        }
        else
        {
            int32_t j = 0;
            for (i = 0; i < 3; i++)
            {
                if (shape[1][i] != 1)
                {
                    shape[1][j] = shape[1][i];
                    j ++;
                }
            }
            for (; j < 3; j++)
            {
                shape[1][j] = 1;
            }
        }

        rs_tensors[0] = vsi_nn_reshape_tensor(graph,
            inputs[0], shape[0], inputs[0]->attr.dim_num);
        rs_tensors[1] = vsi_nn_reshape_tensor(graph,
            inputs[1], shape[1], inputs[1]->attr.dim_num);
        rs_tensors[2] = vsi_nn_reshape_tensor(graph,
            inputs[2], shape[1], inputs[2]->attr.dim_num);
        rs_tensors[3] = vsi_nn_reshape_tensor(graph,
            outputs[0], shape[2], outputs[0]->attr.dim_num);

        if (outputs[0]->attr.size[0] == 1 && axis > 0)
        {
            axis--;
        }
        reshape_tensors[0] = rs_tensors[0];
        reshape_tensors[1] = rs_tensors[1];
        reshape_tensors[2] = rs_tensors[2];
        reshape_tensors[3] = rs_tensors[3];
    }
    else
    {
        reshape_tensors[0] = inputs[0];
        reshape_tensors[1] = inputs[1];
        reshape_tensors[2] = inputs[2];
        reshape_tensors[3] = outputs[0];
    }

    param = (interleaved << 16) | axis;
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _ROPE_PARAM_NUM,
                    reshape_tensors, input_num, &reshape_tensors[3], output_num );
            /* Pass parameters to node. */
            node_params[SCALAR_AXIS] = vsi_nn_kernel_scalar_create(graph, I32, &param);
            status = vsi_nn_kernel_node_pass_param( node, node_params, _ROPE_PARAM_NUM );
            vsi_nn_kernel_scalar_release(&node_params[SCALAR_AXIS]);
        }
    }

    for (i = 0; i < 4; i++)
    {
        vsi_safe_release_tensor(rs_tensors[i]);
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_EVIS( rope, _setup )

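For orientation: the uniAEvenTimesB/uniAOddTimesB tables above gather the even lanes (0,2,4,6) and odd lanes (1,3,5,7) of the input when interleaved rotation is requested, while the uniATimesB tables multiply lane-for-lane and pass half_head_size separately so the shader can pair each element with its rotation partner in the other half of the head. A minimal scalar sketch of the two RoPE pairing schemes (illustrative C only, not code from this diff; both function names are made up):

    /* Half-split pairing: element i rotates with element i + half (non-interleaved path). */
    static void rope_rotate_halved(const float* x, const float* cosv, const float* sinv,
                                   float* y, size_t half)
    {
        for (size_t i = 0; i < half; i++)
        {
            y[i]        = x[i] * cosv[i]        - x[i + half] * sinv[i];
            y[i + half] = x[i + half] * cosv[i] + x[i] * sinv[i];
        }
    }

    /* Interleaved pairing: lanes (2i, 2i+1) rotate together (the interleaved && axis == 0 path). */
    static void rope_rotate_interleaved(const float* x, const float* cosv, const float* sinv,
                                        float* y, size_t head)
    {
        for (size_t i = 0; i < head / 2; i++)
        {
            y[2 * i]     = x[2 * i] * cosv[i]     - x[2 * i + 1] * sinv[i];
            y[2 * i + 1] = x[2 * i + 1] * cosv[i] + x[2 * i] * sinv[i];
        }
    }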
@ -186,18 +186,26 @@ static const _kernel_map_type scatter_nd_update_special_ref_map[] =
{
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
+   TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
+   TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
+   TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_4)
};

static const _kernel_map_type scatter_nd_update_special_update_map[] =
{
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
+   TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
+   TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
+   TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_4)
};

static const _kernel_map_type scatter_nd_update_special_copy_map[] =
{
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
+   TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
+   TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
};

/*
@ -563,6 +571,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer)
    {
        case _PACK_SELECT_KEY( I8, I8 ):
        case _PACK_SELECT_KEY( U8, U8 ):
+       case _PACK_SELECT_KEY( I16, I16 ):
+       case _PACK_SELECT_KEY( U16, U16 ):
        {
            uint16_t M0 = 0;
            int32_t postShift0 = 0;

@ -605,6 +615,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer)
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
        break;
+       case _PACK_SELECT_KEY( F16, F16 ):
+           break;
        default:
            break;
    }

@ -759,6 +771,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer)
    {
        case _PACK_SELECT_KEY( I8, I8 ):
        case _PACK_SELECT_KEY( U8, U8 ):
+       case _PACK_SELECT_KEY( I16, I16 ):
+       case _PACK_SELECT_KEY( U16, U16 ):
        {
            uint16_t M1 = 0;
            int32_t postShift1 = 0;

@ -801,6 +815,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer)
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
        break;
+       case _PACK_SELECT_KEY( F16, F16 ):
+           break;
        default:
            break;
    }

@ -1597,6 +1613,19 @@ static vsi_status _query_kernel_special
        status |= VSI_FAILURE;
    }

+   if (input0_dtype == F16)
+   {
+       input0_dtype = U16;
+   }
+   if (input2_dtype == F16)
+   {
+       input2_dtype = U16;
+   }
+   if (output_dtype == F16)
+   {
+       output_dtype = U16;
+   }
+
    key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 6, 1, 0);

    for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_copy_map); i ++ )

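A plausible reading of the F16-to-U16 reassignments above: the special scatter_nd_update path moves 16-bit values bit-for-bit without interpreting them, so an F16 tensor can be serviced by the U16 table entry and no dedicated F16 copy kernels are needed. A one-line sketch of that normalization (illustrative C; the enum and helper name are made up, only the F16-to-U16 folding comes from the diff):

    typedef enum { U8, I8, U16, I16, F16 } dtype_e;

    /* Fold 16-bit float onto the 16-bit unsigned entry before the table lookup,
     * since a bit-exact 16-bit move is type-agnostic. */
    static dtype_e normalize_for_lookup(dtype_e t)
    {
        return (t == F16) ? U16 : t;
    }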
@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"

@ -591,7 +592,7 @@ static vsi_nn_kernel_node_t _setup
    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
#if (VX_ACTIVATION_EXT_SUPPORT)
-   if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
+   if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
    {
        return NULL;
    }

@ -548,16 +548,16 @@ static vsi_status _gpu_register
    vsi_status status;
    vx_kernel_description_t* info;
    vx_kernel obj;
-   vsi_nn_context_t context;
    vx_program program = NULL;
    const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt;
+   vsi_nn_runtime_option_t* options;
+   options = ((vsi_nn_graph_prv_t*)graph)->options;

#define MAX_BUILDPROGRAM_LEN 1024
    char cmd[MAX_BUILDPROGRAM_LEN] = { 0 };
    size_t cost_bytes = 0;

    memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN );
-   context = graph->ctx;

    status = VSI_FAILURE;
    info = &(kernel->info);

@ -579,21 +579,21 @@ static vsi_status _gpu_register
        return status;
    }

-   if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE )
+   if (options->config.evis.ver == VSI_NN_HW_EVIS_NONE)
    {
        // set default evis version is 2
        if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type )
        {
            cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
                    "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d",
-                   context->config.use_40bits_va );
+                   options->config.use_40bits_va );
        }
    }
    else
    {
        cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
                "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
-               context->config.evis.ver, context->config.use_40bits_va );
+               options->config.evis.ver, options->config.use_40bits_va );
    }
    // Pack build option
    if( kernel->gpu.sources[active_fmt].build_option.data )

@ -655,16 +655,16 @@ static vsi_status _gpu_register_ext
    vsi_status status;
    vx_kernel_description_t* info;
    vx_kernel obj;
-   vsi_nn_context_t context;
    vx_program program = NULL;
    const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt;
+   vsi_nn_runtime_option_t* options;
+   options = ((vsi_nn_graph_prv_t*)graph)->options;

#define MAX_BUILDPROGRAM_LEN 1024
    char cmd[MAX_BUILDPROGRAM_LEN] = { 0 };
    size_t cost_bytes = 0;

    memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN );
-   context = graph->ctx;

    status = VSI_FAILURE;
    info = &(kernel->info);

@ -686,21 +686,21 @@ static vsi_status _gpu_register_ext
        return status;
    }

-   if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE )
+   if (options->config.evis.ver == VSI_NN_HW_EVIS_NONE)
    {
        // set default evis version is 2
        if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type )
        {
            cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
                    "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d",
-                   context->config.use_40bits_va );
+                   options->config.use_40bits_va );
        }
    }
    else
    {
        cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
                "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
-               context->config.evis.ver, context->config.use_40bits_va );
+               options->config.evis.ver, options->config.use_40bits_va );
    }
    // Pack build option
    if( kernel->gpu.sources[active_fmt].build_option.data )

@ -1258,7 +1258,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector
    }
    /* Skip evis if not support */
    if( type == VSI_NN_KERNEL_TYPE_EVIS
-       && graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_NONE )
+       && ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver == VSI_NN_HW_EVIS_NONE )
    {
        continue;
    }

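Both _gpu_register and _gpu_register_ext now read the EVIS version and 40-bit-VA flag from the per-graph runtime options rather than the shared context, but the build command they assemble is unchanged. A standalone sketch of the flag composition (illustrative C with made-up values; only snprintf and the flag text come from the code above):

    #include <stdio.h>

    int main(void)
    {
        char cmd[1024] = { 0 };
        int evis_ver = 2;        /* e.g. VSI_NN_HW_EVIS_2 */
        int use_40bits_va = 0;   /* options->config.use_40bits_va */
        snprintf(cmd, sizeof(cmd),
                 "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
                 evis_ver, use_40bits_va);
        printf("%s\n", cmd);     /* handed to the OpenCL program build step */
        return 0;
    }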
@ -1677,7 +1677,7 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
    int32_t enableShader = ((vsi_nn_graph_prv_t*)graph)->options->enable_shader;

#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
-   if ( graph->ctx->config.subGroupSize == 0 )
+   if ( ((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize == 0 )
    {
        return FALSE;
    }

@ -162,15 +162,11 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(pow)
#if (VX_TENSOR_GATHER_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(gather)
#endif
-#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(relational_ops)
-#endif
#if (VX_TENSOR_TILE_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(tile)
#endif
-#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(layer_norm)
-#endif
#if (VX_ACTIVATION_EXP_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(exp)
#endif

@ -184,6 +180,7 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(log_softmax)
#if (VX_BITCAST_VX_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(bitcast)
#endif
+REGISTER_VX_FIRST_KERNEL_SELECTOR(group_norm)
+REGISTER_VX_FIRST_KERNEL_SELECTOR(instance_norm)

__END_DECLS

@ -0,0 +1,89 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"

#if VX_GROUP_NORMALIZATION_VX_SUPPORT
#define REGISTER_GROUP_NORM_OPENVX_KERNEL( kernel_name ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t * graph, \
        vsi_nn_tensor_t ** inputs, \
        size_t input_num, \
        vsi_nn_tensor_t ** outputs, \
        size_t output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t * kernel \
        ); \
    REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t * graph, \
        vsi_nn_tensor_t ** inputs, \
        size_t input_num, \
        vsi_nn_tensor_t ** outputs, \
        size_t output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t * kernel \
        )

REGISTER_GROUP_NORM_OPENVX_KERNEL(group_norm)
{
    vx_node node = NULL;
    float eps = vsi_nn_kernel_param_get_float32(params, "eps");
    int32_t group_num = vsi_nn_kernel_param_get_int32(params, "group_num");
    vx_tensor inputs_tensor[3] = { NULL };
    vx_tensor output_tensor = NULL;

    inputs_tensor[0] = inputs[0]->t;
    inputs_tensor[1] = inputs[1]->t;
    inputs_tensor[2] = inputs[2]->t;
    output_tensor = outputs[0]->t;

    VSI_UNREFERENCED(output_num);
    VSI_UNREFERENCED(kernel);

    if (graph->ctx->config.support_ffd ||
        graph->ctx->config.support_stream_processor)
    {
        node = vxGroupNormalizationLayer(
            graph->g,
            eps,
            group_num,
            inputs_tensor,
            (vx_uint32)input_num,
            output_tensor
            );
    }

    return (vsi_nn_kernel_node_t)node;
} /* group_norm() */

#endif

@ -0,0 +1,87 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"

#if VX_INSTANCE_NORMALIZATION_VX_SUPPORT
#define REGISTER_INSTANCE_NORM_OPENVX_KERNEL( kernel_name ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t * graph, \
        vsi_nn_tensor_t ** inputs, \
        size_t input_num, \
        vsi_nn_tensor_t ** outputs, \
        size_t output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t * kernel \
        ); \
    REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t * graph, \
        vsi_nn_tensor_t ** inputs, \
        size_t input_num, \
        vsi_nn_tensor_t ** outputs, \
        size_t output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t * kernel \
        )

REGISTER_INSTANCE_NORM_OPENVX_KERNEL(instance_norm)
{
    vsi_nn_kernel_node_t node = NULL;
    float eps = vsi_nn_kernel_param_get_float32(params, "eps");
    vx_tensor inputs_tensor[3] = { NULL };
    vx_tensor output_tensor = NULL;

    inputs_tensor[0] = inputs[0]->t;
    inputs_tensor[1] = inputs[1]->t;
    inputs_tensor[2] = inputs[2]->t;
    output_tensor = outputs[0]->t;

    VSI_UNREFERENCED(output_num);
    VSI_UNREFERENCED(kernel);

    if (graph->ctx->config.support_ffd ||
        graph->ctx->config.support_stream_processor)
    {
        node = vxInstanceNormalizationLayer(
            graph->g,
            eps,
            inputs_tensor,
            (vx_uint32)input_num,
            output_tensor
            );
    }

    return (vsi_nn_kernel_node_t)node;
} /* instance_norm() */

#endif

@ -30,7 +30,7 @@
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"

-#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
+#if (VX_LAYER_NORMALIZATION_VX_SUPPORT)
#define REGISTER_LAYER_NORM_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
    ( \

@ -71,14 +71,20 @@ REGISTER_LAYER_NORM_OPENVX_KERNEL( layer_norm )
    inputs_tensor[2] = inputs[2]->t;
    output_tensor = outputs[0]->t;

-   node = vxLayerNormalizationLayer(
-       graph->g,
-       eps,
-       axis,
-       inputs_tensor,
-       (uint32_t)input_num,
-       output_tensor
+#if !defined(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) || !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
+   if (graph->ctx->config.support_ffd ||
+       graph->ctx->config.support_stream_processor)
+#endif
+   {
+       node = vxLayerNormalizationLayer(
+           graph->g,
+           eps,
+           axis,
+           inputs_tensor,
+           (uint32_t)input_num,
+           output_tensor
        );
+   }

    return (vsi_nn_kernel_node_t)node;
} /* layer_norm() */

@ -89,9 +89,10 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 )
    if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
    {
        vsi_nn_tensor_attr_t attr;
+
        memcpy( &attr, &outputs[0]->attr, sizeof( attr ) );
        memcpy( &attr.size, &inputs[0]->attr.size, sizeof( attr.size ) );
-       attr.vtl = FALSE;
+       attr.vtl = TRUE;
        attr.is_const = FALSE;

        convert_tensor = vsi_nn_CreateTensor(graph, &attr);

@ -30,7 +30,7 @@
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"

-#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
+#if (VX_RELATIONAL_OPS_VX_SUPPORT)

#define REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \

@ -68,12 +68,25 @@ REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( relational_ops )
    VSI_UNREFERENCED(kernel);
    VSI_UNREFERENCED(output_num);

-   node = vxRelationalLayer(graph->g,
-       operation,
-       inputs_tensor,
-       (uint32_t)input_num,
-       outputs[0]->t
-       );
+#if !defined(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) || !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
+   if (vsi_nn_is_broadcast_operaton(inputs, input_num, outputs[0]))
+   {
+       return NULL;
+   }
+#endif
+
+#if !defined(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) || !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
+   if (graph->ctx->config.support_stream_processor)
+#endif
+   {
+       node = vxRelationalLayer(
+           graph->g,
+           operation,
+           inputs_tensor,
+           (uint32_t)input_num,
+           outputs[0]->t
+           );
+   }

    return (vsi_nn_kernel_node_t)node;
} /* relational_ops() */

@ -23,6 +23,7 @@
*****************************************************************************/

#include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"

@ -66,7 +67,7 @@ REGISTER_SWISH_OPENVX_KERNEL( swish )
    VSI_UNREFERENCED(output_num);
    VSI_UNREFERENCED(input_num);

-   if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
+   if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
    {
        swish_type = (vsi_nn_swish_type)vsi_nn_kernel_param_get_int32(params, "type");

@ -67,8 +67,8 @@ __kernel void cumsum_F32toF32_axis2(
    }
}

-#define CUMSUM_toU8_AXIS2_SH(name, src_type, read_image_type) \
-__kernel void cumsum_##name##toU8_axis2( \
+#define CUMSUM_toINT_AXIS2_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
+__kernel void cumsum_##name##_axis2( \
    __read_only image2d_array_t input, \
    __write_only image2d_array_t output, \
    int axis, \

@ -87,19 +87,19 @@ __kernel void cumsum_##name##toU8_axis2( \
    int4 coord_out = coord; \
    \
    src_type sum = (src_type)(0); \
-   uint4 dst = (uint4)(0); \
+   dst_type dst = (dst_type)(0); \
    int tmp_zp = convert_int_rte(output_zp); \
-   dst.x = convert_uint_sat(tmp_zp); \
+   dst.x = convert_dtype(tmp_zp); \
    \
    float cnt = 0.0f; \
    \
    if(exclusive && rev) \
    { \
        coord_out.z = channel - 1; \
-       write_imageui(output, coord_out, dst); \
+       image_write(output, coord_out, dst); \
        for(coord.z = channel - 1; coord.z > 0; coord.z--) \
        { \
-           src_type data = read_image_type(input, coord); \
+           src_type data = image_read(input, coord); \
            coord_out.z--; \
            cnt += 1.0f; \
            sum += data; \

@ -107,17 +107,17 @@ __kernel void cumsum_##name##toU8_axis2( \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
            \
-           dst.x = (uint)convert_int_rte(tmpSum); \
-           write_imageui(output, coord_out, dst); \
+           dst.x = convert_dtype(tmpSum); \
+           image_write(output, coord_out, dst); \
        } \
    } \
    else if(exclusive) \
    { \
        coord_out.z = 0; \
-       write_imageui(output, coord_out, dst); \
+       image_write(output, coord_out, dst); \
        for(coord.z = 0; coord.z < channel - 1; coord.z++) \
        { \
-           src_type data = read_image_type(input, coord); \
+           src_type data = image_read(input, coord); \
            coord_out.z++; \
            cnt += 1.0f; \
            sum += data; \

@ -125,45 +125,44 @@ __kernel void cumsum_##name##toU8_axis2( \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
            \
-           dst.x = (uint)convert_int_rte(tmpSum); \
-           write_imageui(output, coord_out, dst); \
+           dst.x = convert_dtype(tmpSum); \
+           image_write(output, coord_out, dst); \
        } \
    } \
    else if(rev) \
    { \
        for(coord.z = channel - 1; coord.z >= 0; coord.z--) \
        { \
-           src_type data = read_image_type(input, coord); \
+           src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            sum += data; \
            \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
            \
-           dst.x = (uint)convert_int_rte(tmpSum); \
-           write_imageui(output, coord, dst); \
+           dst.x = convert_dtype(tmpSum); \
+           image_write(output, coord, dst); \
        } \
    } \
    else \
    { \
        for(coord.z = 0; coord.z < channel; coord.z++) \
        { \
-           src_type data = read_image_type(input, coord); \
+           src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            sum += data; \
            \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
            \
-           dst.x = (uint)convert_int_rte(tmpSum); \
-           write_imageui(output, coord, dst); \
+           dst.x = convert_dtype(tmpSum); \
+           image_write(output, coord, dst); \
        } \
    } \
}
-CUMSUM_toU8_AXIS2_SH(U8,uint4,read_imageui)
-CUMSUM_toU8_AXIS2_SH(F32,float4,read_imagef)
+CUMSUM_toINT_AXIS2_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_toINT_AXIS2_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_toINT_AXIS2_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)

__kernel void cumsum_F32toF32_axis1(
    __read_only image2d_array_t input,

@ -233,10 +232,10 @@ __kernel void cumsum_F32toF32_axis1(
    }
}

-#define CUMSUM_toU8_AXIS1_SH(name, src_type, read_image_type) \
-__kernel void cumsum_##name##toU8_axis1( \
+#define CUMSUM_toINT_AXIS1_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
+__kernel void cumsum_##name##_axis1( \
    __read_only image2d_array_t input, \
    __write_only image2d_array_t output, \
    int axis, \
    int exclusive, \
    int rev, \

@ -253,20 +252,20 @@ __kernel void cumsum_##name##toU8_axis1( \
    int4 coord_out = coord; \
    \
    src_type sum = (src_type)(0); \
-   uint4 dst = (uint4)(0); \
+   dst_type dst = (dst_type)(0); \
    int tmp_zp = convert_int_rte(output_zp); \
-   dst.x = convert_uint_sat(tmp_zp); \
+   dst.x = convert_dtype(tmp_zp); \
    \
    float cnt = 0; \
    \
    if(exclusive && rev) \
    { \
        coord_out.y = height - 1; \
-       write_imageui(output, coord_out, dst); \
+       image_write(output, coord_out, dst); \
        \
        for(coord.y = height - 1; coord.y > 0; coord.y--) \
        { \
-           src_type data = read_image_type(input, coord); \
+           src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            coord_out.y--; \
            sum += data; \

@ -274,17 +273,17 @@ __kernel void cumsum_##name##toU8_axis1( \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
            \
-           dst.x = (uint)convert_int_rte(tmpSum); \
-           write_imageui(output, coord_out, dst); \
+           dst.x = convert_dtype(tmpSum); \
+           image_write(output, coord_out, dst); \
        } \
    } \
    else if(exclusive) \
    { \
        coord_out.y = 0; \
-       write_imageui(output, coord_out, dst); \
+       image_write(output, coord_out, dst); \
        for(coord.y = 0; coord.y < height - 1; coord.y++) \
        { \
-           src_type data = read_image_type(input, coord); \
+           src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            coord_out.y++; \
            sum += data; \

@ -292,44 +291,44 @@ __kernel void cumsum_##name##toU8_axis1( \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
            \
-           dst.x = (uint)convert_int_rte(tmpSum); \
-           write_imageui(output, coord_out, dst); \
+           dst.x = convert_dtype(tmpSum); \
+           image_write(output, coord_out, dst); \
        } \
    } \
    else if(rev) \
    { \
        for(coord.y = height - 1; coord.y >= 0; coord.y--) \
        { \
-           src_type data = read_image_type(input, coord); \
+           src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            sum += data; \
            \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
            \
-           dst.x = (uint)convert_int_rte(tmpSum); \
-           write_imageui(output, coord, dst); \
+           dst.x = convert_dtype(tmpSum); \
+           image_write(output, coord, dst); \
        } \
    } \
    else \
    { \
        for(coord.y = 0; coord.y < height; coord.y++) \
        { \
-           src_type data = read_image_type(input, coord); \
+           src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            sum += data; \
            \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
            \
-           dst.x = (uint)convert_int_rte(tmpSum); \
-           write_imageui(output, coord, dst); \
+           dst.x = convert_dtype(tmpSum); \
+           image_write(output, coord, dst); \
        } \
    } \
}
-CUMSUM_toU8_AXIS1_SH(U8,uint4,read_imageui)
-CUMSUM_toU8_AXIS1_SH(F32,float4,read_imagef)
+CUMSUM_toINT_AXIS1_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_toINT_AXIS1_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_toINT_AXIS1_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)

__kernel void cumsum_F32toF32_axis0(
    __read_only image2d_array_t input,

@ -399,8 +398,8 @@ __kernel void cumsum_F32toF32_axis0(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#define CUMSUM_toU8_AXIS0_SH(name, src_type, read_image_type) \
|
#define CUMSUM_toINT_AXIS0_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
|
||||||
__kernel void cumsum_##name##toU8_axis0( \
|
__kernel void cumsum_##name##_axis0( \
|
||||||
__read_only image2d_array_t input, \
|
__read_only image2d_array_t input, \
|
||||||
__write_only image2d_array_t output, \
|
__write_only image2d_array_t output, \
|
||||||
int axis, \
|
int axis, \
|
||||||
|
|
@ -419,19 +418,19 @@ __kernel void cumsum_##name##toU8_axis0( \
|
||||||
int4 coord_out = coord; \
|
int4 coord_out = coord; \
|
||||||
\
|
\
|
||||||
src_type sum = (src_type)(0); \
|
src_type sum = (src_type)(0); \
|
||||||
uint4 dst = (uint4)(0); \
|
dst_type dst = (dst_type)(0); \
|
||||||
int tmp_zp = convert_int_rte(output_zp); \
|
int tmp_zp = convert_int_rte(output_zp); \
|
||||||
dst.x = convert_uint_sat(tmp_zp); \
|
dst.x = convert_dtype(tmp_zp); \
|
||||||
\
|
\
|
||||||
float cnt = 0; \
|
float cnt = 0; \
|
||||||
\
|
\
|
||||||
if(exclusive && rev) \
|
if(exclusive && rev) \
|
||||||
{ \
|
{ \
|
||||||
coord_out.x = width - 1; \
|
coord_out.x = width - 1; \
|
||||||
write_imageui(output, coord_out, dst); \
|
image_write(output, coord_out, dst); \
|
||||||
for(coord.x = width - 1; coord.x > 0; coord.x--) \
|
for(coord.x = width - 1; coord.x > 0; coord.x--) \
|
||||||
{ \
|
{ \
|
||||||
src_type data = read_image_type(input, coord); \
|
+            src_type data = image_read(input, coord); \
             coord_out.x--; \
             cnt += 1.0f; \
             sum += data; \
@@ -439,8 +438,8 @@ __kernel void cumsum_##name##toU8_axis0( \
             float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
             float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
-            dst.x = (uint)convert_int_rte(tmpSum); \
+            dst.x = convert_dtype(tmpSum); \
-            write_imageui(output, coord_out, dst); \
+            image_write(output, coord_out, dst); \
         } \
     } \
     else if(exclusive) \
@@ -449,7 +448,7 @@ __kernel void cumsum_##name##toU8_axis0( \
         write_imageui(output, coord_out, dst); \
         for(coord.x = 0; coord.x < width - 1; coord.x++) \
         { \
-            src_type data = read_image_type(input, coord); \
+            src_type data = image_read(input, coord); \
             coord_out.x++; \
             cnt += 1.0f; \
             sum += data; \
@@ -457,40 +456,42 @@ __kernel void cumsum_##name##toU8_axis0( \
             float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
             float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
-            dst.x = (uint)convert_int_rte(tmpSum); \
+            dst.x = convert_dtype(tmpSum); \
-            write_imageui(output, coord_out, dst); \
+            image_write(output, coord_out, dst); \
         } \
     } \
     else if(rev) \
     { \
         for(coord.x = width - 1; coord.x >= 0; coord.x--) \
         { \
-            src_type data = read_image_type(input, coord); \
+            src_type data = image_read(input, coord); \
             cnt += 1.0f; \
             sum += data; \
 \
             float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
             float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
-            dst.x = (uint)convert_int_rte(tmpSum); \
+            dst.x = convert_dtype(tmpSum); \
-            write_imageui(output, coord, dst); \
+            image_write(output, coord, dst); \
         } \
     } \
     else \
     { \
         for(coord.x = 0; coord.x < width; coord.x++) \
         { \
-            src_type data = read_image_type(input, coord); \
+            src_type data = image_read(input, coord); \
             cnt += 1.0f; \
             sum += data; \
 \
             float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
             float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
-            dst.x = (uint)convert_int_rte(tmpSum); \
+            dst.x = convert_dtype(tmpSum); \
-            write_imageui(output, coord, dst); \
+            image_write(output, coord, dst); \
         } \
     } \
 }
-CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui)
+CUMSUM_toINT_AXIS0_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
-CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef)
+CUMSUM_toINT_AXIS0_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_toINT_AXIS0_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
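The change above collapses the separate per-type cumsum kernels into one X-macro, instantiated once per (source type, reader, destination type, writer, converter) tuple, so the U8toU8, F32toU8 and new I32toI32 variants share a single body. A minimal sketch of the same code-generation pattern, using illustrative names (DEMO_CUMSUM, demo_cumsum_*) and a flat buffer instead of images:

    /* Sketch of the macro-stamping pattern, not the committed code.
     * Each expansion emits one typed, serial cumsum kernel. */
    #define DEMO_CUMSUM(name, src_type, dst_type, convert_dtype)        \
    __kernel void demo_cumsum_##name(__global const src_type *input,    \
                                     __global dst_type *output,         \
                                     int width)                         \
    {                                                                   \
        src_type sum = (src_type)0;                                     \
        for (int x = 0; x < width; x++)                                 \
        {                                                               \
            sum += input[x];                /* running inclusive sum */ \
            output[x] = convert_dtype(sum); /* per-type conversion   */ \
        }                                                               \
    }
    DEMO_CUMSUM(F32toI32, float, int, convert_int_sat_rte)
    DEMO_CUMSUM(I32toI32, int,   int, convert_int_sat)

Each instantiation line plays the same role as the CUMSUM_toINT_AXIS0_SH(...) lines above: one macro body, several concrete kernels.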
@@ -65,188 +65,100 @@ __kernel void cumsum_F32toF32_axis1_2D(
     }
 }
 
-__kernel void cumsum_U8toU8_axis1_2D(
-    __read_only  image2d_t input,
-    __write_only image2d_t output,
-    int axis,
-    int exclusive,
-    int rev,
-    int width,
-    int height,
-    int chn,
-    int input_zp,
-    float in_out_scale,
-    float in_out_zp_scale,
-    float output_zp
-    )
-{
-    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
-
-    uint4 sum = (uint4)(0);
-    uint4 dst = (uint4)(0);
-
-    int tmp_zp = convert_int_rte(output_zp);
-    dst.x = convert_uint_sat(tmp_zp);
-
-    float cnt = 0;
-
-    if(exclusive && rev)
-    {
-        coord.w = height - 1;
-        write_imageui(output, coord.zw, dst);
-        for(coord.y = height - 1; coord.y > 0; coord.y--)
-        {
-            uint4 data = read_imageui(input, coord.xy);
-            cnt += 1.0f;
-            coord.w--;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.zw, dst);
-        }
-    }
-    else if(exclusive)
-    {
-        write_imageui(output, coord.zw, dst);
-        for(coord.y = 0; coord.y < height - 1; coord.y++)
-        {
-            uint4 data = read_imageui(input, coord.xy);
-            cnt += 1.0f;
-            coord.w++;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.zw, dst);
-        }
-    }
-    else if(rev)
-    {
-        for(coord.y = height - 1; coord.y >= 0; coord.y--)
-        {
-            uint4 data = read_imageui(input, coord.xy);
-            cnt += 1.0f;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.xy, dst);
-        }
-    }
-    else
-    {
-        for(coord.y = 0; coord.y < height; coord.y++)
-        {
-            uint4 data = read_imageui(input, coord.xy);
-            cnt += 1.0f;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.xy, dst);
-        }
-    }
-}
-
-__kernel void cumsum_F32toU8_axis1_2D(
-    __read_only  image2d_t input,
-    __write_only image2d_t output,
-    int axis,
-    int exclusive,
-    int rev,
-    int width,
-    int height,
-    int chn,
-    int input_zp,
-    float in_out_scale,
-    float in_out_zp_scale,
-    float output_zp
-    )
-{
-    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
-
-    float4 sum = (float4)(0);
-    uint4 dst = (uint4)(0);
-    int tmp_zp = convert_int_rte(output_zp);
-    dst.x = convert_uint_sat(tmp_zp);
-
-    float cnt = 0;
-
-    if(exclusive && rev)
-    {
-        coord.w = height - 1;
-        write_imageui(output, coord.zw, dst);
-        for(coord.y = height - 1; coord.y > 0; coord.y--)
-        {
-            float4 data = read_imagef(input, coord.xy);
-            cnt += 1.0f;
-            coord.w--;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.zw, dst);
-        }
-    }
-    else if(exclusive)
-    {
-        write_imageui(output, coord.zw, dst);
-        for(coord.y = 0; coord.y < height - 1; coord.y++)
-        {
-            float4 data = read_imagef(input, coord.xy);
-            cnt += 1.0f;
-            coord.w++;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.zw, dst);
-        }
-    }
-    else if(rev)
-    {
-        for(coord.y = height - 1; coord.y >= 0; coord.y--)
-        {
-            float4 data = read_imagef(input, coord.xy);
-            cnt += 1.0f;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.xy, dst);
-        }
-    }
-    else
-    {
-        for(coord.y = 0; coord.y < height; coord.y++)
-        {
-            float4 data = read_imagef(input, coord.xy);
-            cnt += 1.0f;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.xy, dst);
-        }
-    }
-}
+#define CUMSUM_INT_AXIS1_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
+__kernel void cumsum_##name##_axis1_2D( \
+    __read_only  image2d_t input, \
+    __write_only image2d_t output, \
+    int axis, \
+    int exclusive, \
+    int rev, \
+    int width, \
+    int height, \
+    int chn, \
+    int input_zp, \
+    float in_out_scale, \
+    float in_out_zp_scale, \
+    float output_zp \
+    ) \
+{ \
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
+ \
+    src_type sum = (src_type)(0); \
+    dst_type dst = (dst_type)(0); \
+    int tmp_zp = convert_int_rte(output_zp); \
+    dst.x = convert_dtype(tmp_zp); \
+ \
+    float cnt = 0; \
+ \
+    if(exclusive && rev) \
+    { \
+        coord.w = height - 1; \
+        image_write(output, coord.zw, dst); \
+        for(coord.y = height - 1; coord.y > 0; coord.y--) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            coord.w--; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.zw, dst); \
+        } \
+    } \
+    else if(exclusive) \
+    { \
+        image_write(output, coord.zw, dst); \
+        for(coord.y = 0; coord.y < height - 1; coord.y++) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            coord.w++; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.zw, dst); \
+        } \
+    } \
+    else if(rev) \
+    { \
+        for(coord.y = height - 1; coord.y >= 0; coord.y--) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.xy, dst); \
+        } \
+    } \
+    else \
+    { \
+        for(coord.y = 0; coord.y < height; coord.y++) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.xy, dst); \
+        } \
+    } \
+}
+CUMSUM_INT_AXIS1_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_INT_AXIS1_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_INT_AXIS1_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
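For reference, the four branches in these kernels implement the usual cumsum variants: inclusive, exclusive (result shifted one position, with the quantized zero written first), reverse, and exclusive-plus-reverse. A scalar sketch of the semantics over one row, independent of the image plumbing (the function name is illustrative):

    /* Reference semantics of the four cumsum branches over one row. */
    void demo_cumsum_row(const float *in, float *out, int n,
                         int exclusive, int rev)
    {
        float sum = 0.0f;
        if (exclusive && rev) {      /* out[n-1] = 0, then suffix sums */
            out[n - 1] = 0.0f;
            for (int x = n - 1; x > 0; x--) { sum += in[x]; out[x - 1] = sum; }
        } else if (exclusive) {      /* out[0] = 0, then prefix sums   */
            out[0] = 0.0f;
            for (int x = 0; x < n - 1; x++) { sum += in[x]; out[x + 1] = sum; }
        } else if (rev) {            /* inclusive suffix sums          */
            for (int x = n - 1; x >= 0; x--) { sum += in[x]; out[x] = sum; }
        } else {                     /* inclusive prefix sums          */
            for (int x = 0; x < n; x++) { sum += in[x]; out[x] = sum; }
        }
    }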
 __kernel void cumsum_F32toF32_axis0_2D(
     __read_only  image2d_t input,
@@ -316,188 +228,100 @@ __kernel void cumsum_F32toF32_axis0_2D(
     }
 }
 
-__kernel void cumsum_U8toU8_axis0_2D(
-    __read_only  image2d_t input,
-    __write_only image2d_t output,
-    int axis,
-    int exclusive,
-    int rev,
-    int width,
-    int height,
-    int chn,
-    int input_zp,
-    float in_out_scale,
-    float in_out_zp_scale,
-    float output_zp
-    )
-{
-    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
-
-    uint4 sum = (uint4)(0);
-    uint4 dst = (uint4)(0);
-
-    int tmp_zp = convert_int_rte(output_zp);
-    dst.x = convert_uint_sat(tmp_zp);
-
-    float cnt = 0.0f;
-    if(exclusive && rev)
-    {
-        coord.x = width - 1;
-        coord.z = coord.x;
-        write_imageui(output, coord.zw, dst);
-        for(; coord.x > 0; coord.x--)
-        {
-            uint4 data = read_imageui(input, coord.xy);
-            coord.z--;
-            cnt += 1.0;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.zw, dst);
-        }
-    }
-    else if(exclusive)
-    {
-        coord.z = 0;
-        write_imageui(output, coord.zw, dst);
-        for(coord.x = 0; coord.x < width - 1; coord.x++)
-        {
-            uint4 data = read_imageui(input, coord.xy);
-            cnt += 1.0f;
-            coord.z++;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.zw, dst);
-        }
-    }
-    else if(rev)
-    {
-        for(coord.x = width - 1; coord.x >= 0; coord.x--)
-        {
-            uint4 data = read_imageui(input, coord.xy);
-            cnt += 1.0f;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.xy, dst);
-        }
-    }
-    else
-    {
-        for(coord.x = 0; coord.x < width; coord.x++)
-        {
-            uint4 data = read_imageui(input, coord.xy);
-            cnt += 1.0f;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.xy, dst);
-        }
-    }
-}
-
-__kernel void cumsum_F32toU8_axis0_2D(
-    __read_only  image2d_t input,
-    __write_only image2d_t output,
-    int axis,
-    int exclusive,
-    int rev,
-    int width,
-    int height,
-    int chn,
-    int input_zp,
-    float in_out_scale,
-    float in_out_zp_scale,
-    float output_zp
-    )
-{
-    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
-
-    float4 sum = (float4)(0);
-    uint4 dst = (uint4)(0);
-    int tmp_zp = convert_int_rte(output_zp);
-    dst.x = convert_uint_sat(tmp_zp);
-
-    float cnt = 0.0f;
-    if(exclusive && rev)
-    {
-        coord.x = width - 1;
-        coord.z = coord.x;
-        write_imageui(output, coord.zw, dst);
-        for(; coord.x > 0; coord.x--)
-        {
-            float4 data = read_imagef(input, coord.xy);
-            coord.z--;
-            cnt += 1.0;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.zw, dst);
-        }
-    }
-    else if(exclusive)
-    {
-        coord.z = 0;
-        write_imageui(output, coord.zw, dst);
-        for(coord.x = 0; coord.x < width - 1; coord.x++)
-        {
-            float4 data = read_imagef(input, coord.xy);
-            cnt += 1.0f;
-            coord.z++;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.zw, dst);
-        }
-    }
-    else if(rev)
-    {
-        for(coord.x = width - 1; coord.x >= 0; coord.x--)
-        {
-            float4 data = read_imagef(input, coord.xy);
-            cnt += 1.0f;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.xy, dst);
-        }
-    }
-    else
-    {
-        for(coord.x = 0; coord.x < width; coord.x++)
-        {
-            float4 data = read_imagef(input, coord.xy);
-            cnt += 1.0f;
-            sum += data;
-
-            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
-            float tmpSum = sum.x * in_out_scale + tmpAlpha;
-
-            dst.x = (uint)convert_int_rte(tmpSum);
-            write_imageui(output, coord.xy, dst);
-        }
-    }
-}
+#define CUMSUM_INT_AXIS0_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
+__kernel void cumsum_##name##_axis0_2D( \
+    __read_only  image2d_t input, \
+    __write_only image2d_t output, \
+    int axis, \
+    int exclusive, \
+    int rev, \
+    int width, \
+    int height, \
+    int chn, \
+    int input_zp, \
+    float in_out_scale, \
+    float in_out_zp_scale, \
+    float output_zp \
+    ) \
+{ \
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
+ \
+    src_type sum = (src_type)(0); \
+    dst_type dst = (dst_type)(0); \
+ \
+    int tmp_zp = convert_int_rte(output_zp); \
+    dst.x = convert_dtype(tmp_zp); \
+ \
+    float cnt = 0.0f; \
+ \
+    if(exclusive && rev) \
+    { \
+        coord.x = width - 1; \
+        coord.z = coord.x; \
+        image_write(output, coord.zw, dst); \
+        for(; coord.x > 0; coord.x--) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            coord.z--; \
+            cnt += 1.0; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.zw, dst); \
+        } \
+    } \
+    else if(exclusive) \
+    { \
+        coord.z = 0; \
+        image_write(output, coord.zw, dst); \
+        for(coord.x = 0; coord.x < width - 1; coord.x++) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            coord.z++; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.zw, dst); \
+        } \
+    } \
+    else if(rev) \
+    { \
+        for(coord.x = width - 1; coord.x >= 0; coord.x--) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.xy, dst); \
+        } \
+    } \
+    else \
+    { \
+        for(coord.x = 0; coord.x < width; coord.x++) \
+        { \
+            src_type data = image_read(input, coord.xy); \
+            cnt += 1.0f; \
+            sum += data; \
+ \
+            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
+            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
+ \
+            dst.x = convert_dtype(tmpSum); \
+            image_write(output, coord.xy, dst); \
+        } \
+    } \
+}
+CUMSUM_INT_AXIS0_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_INT_AXIS0_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
+CUMSUM_INT_AXIS0_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
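The tmpSum/tmpAlpha rescale used throughout these kernels follows from accumulating raw quantized values and requantizing once per output. If inputs dequantize as x_i = s_in * (q_i - zp_in) and outputs quantize as q = x / s_out + zp_out, then after cnt accumulated elements q_out = (sum of q_i) * (s_in / s_out) + cnt * (-zp_in * s_in / s_out) + zp_out, which matches sum.x * in_out_scale + cnt * in_out_zp_scale + output_zp, assuming in_out_scale = s_in / s_out and in_out_zp_scale = -zp_in * s_in / s_out (the uniform names suggest this; the host-side setup is not shown in this diff). A sketch under that assumption:

    /* Requantize a running sum of cnt raw quantized values in one step.
     * Assumes in_out_scale = s_in/s_out and in_out_zp_scale =
     * -zp_in*s_in/s_out, which this diff does not itself confirm. */
    inline uint demo_requantize_sum(uint sum_q, float cnt,
                                    float in_out_scale,
                                    float in_out_zp_scale,
                                    float output_zp)
    {
        float tmpAlpha = cnt * in_out_zp_scale + output_zp; /* zp terms   */
        float tmpSum   = (float)sum_q * in_out_scale + tmpAlpha;
        return convert_uint_sat_rte(tmpSum);                /* round+clamp */
    }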
@@ -132,3 +132,30 @@ __kernel void one_hot_U8toU8
         coord.z ++;
     } while (coord.z < depth);
 }
+
+__kernel void one_hot_I32toBF16
+    (
+    __read_only  image2d_t input,
+    __write_only image2d_array_t output,
+    int   depth,
+    uint  on_value,
+    uint  off_value,
+    float inputScale,
+    float inputTail
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+
+    int4 src = read_imagei(input, coord.xy);
+
+    int val = convert_int(convert_float(src.x) * inputScale - inputTail);
+    do
+    {
+        uint4 dst;
+        dst.x = val == coord.z ? on_value : off_value;
+
+        write_imageui(output, coord.xzyw, dst.xxxx);
+
+        coord.z ++;
+    } while (coord.z < depth);
+}
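Like the existing one_hot variants, the new kernel maps each input index to a vector along the depth axis: the index is first rescaled with inputScale/inputTail, then on_value is written where the depth coordinate equals the index and off_value everywhere else (for BF16 the two values arrive as raw 16-bit patterns carried in uints). With depth = 4 and val = 2, a work item emits off, off, on, off. A buffer-based sketch of the same semantics, with illustrative names:

    /* One-hot semantics in buffer form; not the committed kernel.
     * on_value/off_value are raw bit patterns (e.g. BF16 halves). */
    __kernel void demo_one_hot(__global const int *indices,
                               __global uint *out,   /* [n][depth] */
                               int depth, uint on_value, uint off_value,
                               float inputScale, float inputTail)
    {
        int i   = get_global_id(0);
        int val = convert_int(convert_float(indices[i]) * inputScale - inputTail);
        for (int d = 0; d < depth; d++)
            out[i * depth + d] = (val == d) ? on_value : off_value;
    }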
@@ -0,0 +1,373 @@
+__kernel void rope_F32_F32toF32_axis0
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+    float4 cos, sin;
+
+    READ_IMAGEF_2DARRAY(cos, cos_cache, coord);
+    READ_IMAGEF_2DARRAY(sin, sin_cache, coord);
+    coord.x = coord.x * step;
+    float4 src0 = read_imagef(input, coord);
+    int4 coord_out = coord;
+
+    coord.x += half_head_size;
+    float4 src1 = read_imagef(input, coord);
+
+    float4 dst0 = src0 * cos - src1 * sin;
+    float4 dst1 = src0 * sin + src1 * cos;
+
+    write_imagef(output, coord_out, dst0);
+    coord_out.x += half_head_size;
+    write_imagef(output, coord_out, dst1);
+}
+
+__kernel void rope_F32_F32toF32_axis1
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+    float4 cos, sin;
+
+    READ_IMAGEF_2DARRAY(cos, cos_cache, coord);
+    READ_IMAGEF_2DARRAY(sin, sin_cache, coord);
+    coord.y = coord.y * step;
+    float4 src0 = read_imagef(input, coord);
+    int4 coord_out = coord;
+    coord.y += half_head_size;
+    float4 src1 = read_imagef(input, coord);
+
+    float4 dst0 = src0 * cos - src1 * sin;
+    float4 dst1 = src0 * sin + src1 * cos;
+
+    write_imagef(output, coord_out, dst0);
+    coord_out.y += half_head_size;
+    write_imagef(output, coord_out, dst1);
+}
+
+__kernel void rope_F32_F32toF32_axis2
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+
+    float4 cos = read_imagef(cos_cache, coord);
+    float4 sin = read_imagef(sin_cache, coord);
+    coord.z = coord.z * step;
+    float4 src0 = read_imagef(input, coord);
+    int4 coord_out = coord;
+    coord.z += half_head_size;
+    float4 src1 = read_imagef(input, coord);
+
+    float4 dst0 = src0 * cos - src1 * sin;
+    float4 dst1 = src0 * sin + src1 * cos;
+
+    write_imagef(output, coord_out, dst0);
+    coord_out.z += half_head_size;
+    write_imagef(output, coord_out, dst1);
+}
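These kernels apply rotary position embedding (RoPE) in its split-half form: the rotated axis is divided into two halves, and each pair (x0, x1) taken half_head_size apart is rotated by a cached angle, dst0 = x0*cos - x1*sin and dst1 = x0*sin + x1*cos. A minimal buffer-based sketch of the rotation (names are illustrative; the committed kernels do the same arithmetic through image reads and a per-axis coordinate walk):

    /* Split-half RoPE on a flat vector; one work item rotates one pair. */
    __kernel void demo_rope_half(__global const float *x,
                                 __global const float *cos_cache,
                                 __global const float *sin_cache,
                                 __global float *y,
                                 int half_head_size)
    {
        int i = get_global_id(0);          /* 0 .. half_head_size-1 */
        float c  = cos_cache[i];
        float s  = sin_cache[i];
        float x0 = x[i];
        float x1 = x[i + half_head_size];  /* partner element */
        y[i]                  = x0 * c - x1 * s;
        y[i + half_head_size] = x0 * s + x1 * c;
    }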
+
+__kernel void rope_I32_I32toI32_axis0
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+    int4 _cos, _sin;
+    float4 cos, sin;
+
+    READ_IMAGEI_2DARRAY(_cos, cos_cache, coord);
+    READ_IMAGEI_2DARRAY(_sin, sin_cache, coord);
+    coord.x = coord.x * step;
+    float4 src0 = convert_float4(read_imagei(input, coord));
+    int4 coord_out = coord;
+
+    coord.x += half_head_size;
+    float4 src1 = convert_float4(read_imagei(input, coord));
+
+    src0 = src0 - input_zp;
+    src1 = src1 - input_zp;
+    cos = convert_float4(_cos) - cos_zp;
+    sin = convert_float4(_sin) - sin_zp;
+
+    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
+    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
+    int4 dst0 = convert_int4_rte(_dst0);
+    int4 dst1 = convert_int4_rte(_dst1);
+
+    write_imagei(output, coord_out, dst0);
+    coord_out.x += half_head_size;
+    write_imagei(output, coord_out, dst1);
+}
+
+__kernel void rope_I32_I32toI32_axis1
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+    int4 _cos, _sin;
+    float4 cos, sin;
+
+    READ_IMAGEI_2DARRAY(_cos, cos_cache, coord);
+    READ_IMAGEI_2DARRAY(_sin, sin_cache, coord);
+    coord.y = coord.y * step;
+    float4 src0 = convert_float4(read_imagei(input, coord));
+    int4 coord_out = coord;
+
+    coord.y += half_head_size;
+    float4 src1 = convert_float4(read_imagei(input, coord));
+
+    src0 = src0 - input_zp;
+    src1 = src1 - input_zp;
+    cos = convert_float4(_cos) - cos_zp;
+    sin = convert_float4(_sin) - sin_zp;
+
+    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
+    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
+    int4 dst0 = convert_int4_rte(_dst0);
+    int4 dst1 = convert_int4_rte(_dst1);
+
+    write_imagei(output, coord_out, dst0);
+    coord_out.y += half_head_size;
+    write_imagei(output, coord_out, dst1);
+}
+
+__kernel void rope_I32_I32toI32_axis2
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+
+    float4 cos = convert_float4(read_imagei(cos_cache, coord));
+    float4 sin = convert_float4(read_imagei(sin_cache, coord));
+    coord.z = coord.z * step;
+    float4 src0 = convert_float4(read_imagei(input, coord));
+    int4 coord_out = coord;
+
+    coord.z += half_head_size;
+    float4 src1 = convert_float4(read_imagei(input, coord));
+
+    src0 = src0 - input_zp;
+    src1 = src1 - input_zp;
+    cos = cos - cos_zp;
+    sin = sin - sin_zp;
+
+    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
+    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
+    int4 dst0 = convert_int4_rte(_dst0);
+    int4 dst1 = convert_int4_rte(_dst1);
+
+    write_imagei(output, coord_out, dst0);
+    coord_out.z += half_head_size;
+    write_imagei(output, coord_out, dst1);
+}
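The I32 variants do the same rotation on dequantized values: zero points are subtracted from the inputs and both caches, per-term scales (scale0 on the cos products, scale1 on the sin products, presumably combining input, cache and output scales on the host side, which this diff does not show) are folded into the multiplies, and the result is re-biased by output_zp and rounded to nearest. A sketch of one pair under that reading of the uniforms:

    /* Quantized RoPE for one element pair; scale0/scale1 are assumed to
     * already combine input, cache and output scales (hypothetical). */
    inline int2 demo_rope_quant(int q0, int q1, int qc, int qs,
                                float input_zp, float cos_zp, float sin_zp,
                                float scale0, float scale1, float output_zp)
    {
        float x0 = (float)q0 - input_zp, x1 = (float)q1 - input_zp;
        float c  = (float)qc - cos_zp,   s  = (float)qs - sin_zp;
        float d0 = x0 * c * scale0 - x1 * s * scale1 + output_zp;
        float d1 = x0 * s * scale1 + x1 * c * scale0 + output_zp;
        return (int2)(convert_int_rte(d0), convert_int_rte(d1));
    }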
+
+__kernel void rope_U32_U32toU32_axis0
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+    uint4 _cos, _sin;
+    float4 cos, sin;
+
+    READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord);
+    READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord);
+    coord.x = coord.x * step;
+    float4 src0 = convert_float4(read_imageui(input, coord));
+    int4 coord_out = coord;
+
+    coord.x += half_head_size;
+    float4 src1 = convert_float4(read_imageui(input, coord));
+
+    src0 = src0 - input_zp;
+    src1 = src1 - input_zp;
+    cos = convert_float4(_cos) - cos_zp;
+    sin = convert_float4(_sin) - sin_zp;
+
+    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
+    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
+    uint4 dst0 = convert_uint4_rte(_dst0);
+    uint4 dst1 = convert_uint4_rte(_dst1);
+
+    write_imageui(output, coord_out, dst0);
+    coord_out.x += half_head_size;
+    write_imageui(output, coord_out, dst1);
+}
+
+__kernel void rope_U32_U32toU32_axis1
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+    uint4 _cos, _sin;
+    float4 cos, sin;
+
+    READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord);
+    READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord);
+    coord.y = coord.y * step;
+    float4 src0 = convert_float4(read_imageui(input, coord));
+    int4 coord_out = coord;
+
+    coord.y += half_head_size;
+    float4 src1 = convert_float4(read_imageui(input, coord));
+
+    src0 = src0 - input_zp;
+    src1 = src1 - input_zp;
+    cos = convert_float4(_cos) - cos_zp;
+    sin = convert_float4(_sin) - sin_zp;
+
+    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
+    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
+    uint4 dst0 = convert_uint4_rte(_dst0);
+    uint4 dst1 = convert_uint4_rte(_dst1);
+
+    write_imageui(output, coord_out, dst0);
+    coord_out.y += half_head_size;
+    write_imageui(output, coord_out, dst1);
+}
+
+__kernel void rope_U32_U32toU32_axis2
+    (
+    __read_only  image2d_array_t input,
+    __read_only  image2d_array_t cos_cache,
+    __read_only  image2d_array_t sin_cache,
+    __write_only image2d_array_t output,
+    int   axis,
+    float input_zp,
+    float cos_zp,
+    float sin_zp,
+    float scale0,
+    float scale1,
+    float output_zp,
+    int   half_head_size,
+    int   step
+    )
+{
+    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
+
+    float4 cos = convert_float4(read_imageui(cos_cache, coord));
+    float4 sin = convert_float4(read_imageui(sin_cache, coord));
+    coord.z = coord.z * step;
+    float4 src0 = convert_float4(read_imageui(input, coord));
+    int4 coord_out = coord;
+
+    coord.z += half_head_size;
+    float4 src1 = convert_float4(read_imageui(input, coord));
+
+    src0 = src0 - input_zp;
+    src1 = src1 - input_zp;
+    cos = cos - cos_zp;
+    sin = sin - sin_zp;
+
+    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
+    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
+    uint4 dst0 = convert_uint4_rte(_dst0);
+    uint4 dst1 = convert_uint4_rte(_dst1);
+
+    write_imageui(output, coord_out, dst0);
+    coord_out.z += half_head_size;
+    write_imageui(output, coord_out, dst1);
+}
@@ -0,0 +1,307 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform int top;
+_viv_uniform int left;
+_viv_uniform float out_scale_r;
+_viv_uniform float out_scale_g;
+_viv_uniform float out_scale_b;
+_viv_uniform float out_zp_r;
+_viv_uniform float out_zp_g;
+_viv_uniform float out_zp_b;
+_viv_uniform float pad_v_r;
+_viv_uniform float pad_v_g;
+_viv_uniform float pad_v_b;
+_viv_uniform float scale_w;
+_viv_uniform float scale_h;
+_viv_uniform int resize_max_w;
+_viv_uniform int resize_max_h;
+_viv_uniform int out_height;
+_viv_uniform int r_order;
+_viv_uniform int b_order;
+_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;
+_viv_uniform VXC_512Bits uniLeftToFloat32_4x4;
+_viv_uniform VXC_512Bits uniExtactHalf8_2x8;
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;
+
+__kernel void custom_letterbox_U8toU8
+    (
+    __read_only  image2d_t input,
+    __write_only image2d_t output,
+    int   top_,
+    int   bottom_,
+    int   left_,
+    int   right_,
+    float mean_r_,
+    float mean_g_,
+    float mean_b_,
+    float scale_r_,
+    float scale_g_,
+    float scale_b_,
+    int   pad_r_,
+    int   pad_g_,
+    int   pad_b_,
+    int   reverse_channel
+    )
+{
+    int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
+    int2 coord = coord_out;
+    uint4 dst = (uint4)(0,0,0,0);
+    vxc_uchar8 result;
+
+    if (coord_out.x < left || coord_out.x >= resize_max_w ||
+        coord_out.y < top  || coord_out.y >= resize_max_h)
+    {
+        dst.x = convert_uint(pad_v_r);
+        coord.y = coord_out.y + r_order;
+        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+        dst.x = convert_uint(pad_v_g);
+        coord.y = coord_out.y + out_height;
+        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+        dst.x = convert_uint(pad_v_b);
+        coord.y = coord_out.y + b_order;
+        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+        return;
+    }
+
+    float in_x = convert_float(coord_out.x - left) * scale_w;
+    float in_y = convert_float(coord_out.y - top) * scale_h;
+    float left_x_f = floor(in_x);
+    float top_y_f = floor(in_y);
+    float x_lerp = in_x - left_x_f;
+    float y_lerp = in_y - top_y_f;
+    int left_x_idx = convert_int(left_x_f);
+    int top_y_idx = convert_int(top_y_f);
+
+    int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
+    vxc_uchar8 top_data, bottom_data;
+
+    VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
+        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
+    VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
+        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
+
+    float4 left4 = (float4)(0,0,0,0);
+    float4 right4 = (float4)(0,0,0,0);
+    float4 top4 = (float4)(0,0,0,0);
+    float4 bottom4 = (float4)(0,0,0,0);
+    VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
+    VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
+    top4 = right4 * x_lerp + left4;
+    VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
+    VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
+    bottom4 = right4 * x_lerp + left4;
+    float4 out = (bottom4 - top4) * y_lerp + top4;
+
+    dst.x = convert_uint(out.x * out_scale_r + out_zp_r);
+    coord.y = coord_out.y + r_order;
+    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+    dst.x = convert_uint(out.y * out_scale_g + out_zp_g);
+    coord.y = coord_out.y + out_height;
+    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+    dst.x = convert_uint(out.z * out_scale_b + out_zp_b);
+    coord.y = coord_out.y + b_order;
+    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+}
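custom_letterbox fuses three steps into one pass: bilinear resize of the interior, constant padding of the border (pad_v_*), and per-channel affine requantization (out_scale_*, out_zp_*). The interpolation is the standard two-lerp form; right4 above holds right-minus-left differences (via uniU8RightSubLeft_4x4), so right4 * x_lerp + left4 is a horizontal lerp. A scalar sketch:

    /* Bilinear sample at fractional offsets (x_lerp, y_lerp) between
     * the four neighbouring pixels p00..p11; sketch only. */
    inline float demo_bilinear(float p00, float p01, float p10, float p11,
                               float x_lerp, float y_lerp)
    {
        float top    = (p01 - p00) * x_lerp + p00; /* x-lerp, top row    */
        float bottom = (p11 - p10) * x_lerp + p10; /* x-lerp, bottom row */
        return (bottom - top) * y_lerp + top;      /* y-lerp             */
    }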
+
+__kernel void custom_letterbox_U8toI8
+    (
+    __read_only  image2d_t input,
+    __write_only image2d_t output,
+    int   top_,
+    int   bottom_,
+    int   left_,
+    int   right_,
+    float mean_r_,
+    float mean_g_,
+    float mean_b_,
+    float scale_r_,
+    float scale_g_,
+    float scale_b_,
+    int   pad_r_,
+    int   pad_g_,
+    int   pad_b_,
+    int   reverse_channel
+    )
+{
+    int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
+    int2 coord = coord_out;
+    int4 dst = (int4)(0,0,0,0);
+    vxc_char8 result;
+
+    if (coord_out.x < left || coord_out.x >= resize_max_w ||
+        coord_out.y < top  || coord_out.y >= resize_max_h)
+    {
+        dst.x = convert_int(pad_v_r);
+        coord.y = coord_out.y + r_order;
+        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+        dst.x = convert_int(pad_v_g);
+        coord.y = coord_out.y + out_height;
+        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+        dst.x = convert_int(pad_v_b);
+        coord.y = coord_out.y + b_order;
+        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+        return;
+    }
+
+    float in_x = convert_float(coord_out.x - left) * scale_w;
+    float in_y = convert_float(coord_out.y - top) * scale_h;
+    float left_x_f = floor(in_x);
+    float top_y_f = floor(in_y);
+    float x_lerp = in_x - left_x_f;
+    float y_lerp = in_y - top_y_f;
+    int left_x_idx = convert_int(left_x_f);
+    int top_y_idx = convert_int(top_y_f);
+
+    int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
+    vxc_char8 top_data, bottom_data;
+
+    VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
+        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
+    VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
+        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
+
+    float4 left4 = (float4)(0,0,0,0);
+    float4 right4 = (float4)(0,0,0,0);
+    float4 top4 = (float4)(0,0,0,0);
+    float4 bottom4 = (float4)(0,0,0,0);
+    VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
+    VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
+    top4 = right4 * x_lerp + left4;
+    VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
+    VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
+    bottom4 = right4 * x_lerp + left4;
+    float4 out = (bottom4 - top4) * y_lerp + top4;
+
+    dst.x = convert_int(out.x * out_scale_r + out_zp_r);
+    coord.y = coord_out.y + r_order;
+    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+    dst.x = convert_int(out.y * out_scale_g + out_zp_g);
+    coord.y = coord_out.y + out_height;
+    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+    dst.x = convert_int(out.z * out_scale_b + out_zp_b);
+    coord.y = coord_out.y + b_order;
+    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
+    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+}
+
+__kernel void custom_letterbox_U8toF16
+    (
+    __read_only  image2d_t input,
+    __write_only image2d_t output,
+    int   top_,
+    int   bottom_,
+    int   left_,
+    int   right_,
+    float mean_r_,
+    float mean_g_,
+    float mean_b_,
+    float scale_r_,
+    float scale_g_,
+    float scale_b_,
+    int   pad_r_,
+    int   pad_g_,
+    int   pad_b_,
+    int   reverse_channel
+    )
+{
+    int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
+    int2 coord = coord_out;
+    half4 tmp;
+    vxc_half8 dst_temp;
+    vxc_ushort8 dst;
+
+    if (coord_out.x < left || coord_out.x >= resize_max_w ||
+        coord_out.y < top  || coord_out.y >= resize_max_h)
+    {
+        float4 pad = (float4)(pad_v_r, pad_v_g, pad_v_b, 0);
+        _viv_asm(CONV, tmp, pad);
+        VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
+        _viv_asm(COPY, dst, dst_temp, 16);
+        coord.y = coord_out.y + r_order;
+        VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+        tmp.x = tmp.y;
+        VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
+        _viv_asm(COPY, dst, dst_temp, 16);
+        coord.y = coord_out.y + out_height;
+        VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+        tmp.x = tmp.z;
+        VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
+        _viv_asm(COPY, dst, dst_temp, 16);
+        coord.y = coord_out.y + b_order;
+        VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+        return;
+    }
+
+    float in_x = convert_float(coord_out.x - left) * scale_w;
+    float in_y = convert_float(coord_out.y - top) * scale_h;
+    float left_x_f = floor(in_x);
+    float top_y_f = floor(in_y);
+    float x_lerp = in_x - left_x_f;
+    float y_lerp = in_y - top_y_f;
+    int left_x_idx = convert_int(left_x_f);
+    int top_y_idx = convert_int(top_y_f);
+
+    int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
+    vxc_uchar8 top_data, bottom_data;
+
+    VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
+        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
+    VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
+        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
+
+    float4 left4 = (float4)(0,0,0,0);
+    float4 right4 = (float4)(0,0,0,0);
+    float4 top4 = (float4)(0,0,0,0);
+    float4 bottom4 = (float4)(0,0,0,0);
+    VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
+    VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
+    top4 = right4 * x_lerp + left4;
+    VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
+    VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
+    bottom4 = right4 * x_lerp + left4;
+    float4 out = (bottom4 - top4) * y_lerp + top4;
+
+    float4 out_temp = (float4)(0,0,0,0);
+    out_temp.x = out.x * out_scale_r + out_zp_r;
+    _viv_asm(CONV, tmp, out_temp);
+    VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
+    _viv_asm(COPY, dst, dst_temp, 16);
+    coord.y = coord_out.y + r_order;
+    VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+    out_temp.x = out.y * out_scale_g + out_zp_g;
+    _viv_asm(CONV, tmp, out_temp);
+    VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
+    _viv_asm(COPY, dst, dst_temp, 16);
+    coord.y = coord_out.y + out_height;
+    VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+
+    out_temp.x = out.z * out_scale_b + out_zp_b;
+    _viv_asm(CONV, tmp, out_temp);
+    VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
+    _viv_asm(COPY, dst, dst_temp, 16);
+    coord.y = coord_out.y + b_order;
+    VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+}
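All three variants write a planar layout: for output pixel (x, y), red lands on row y + r_order, green on row y + out_height, and blue on row y + b_order. Reading the code, r_order/b_order appear to hold 0 and 2*out_height (swapped when reverse_channel requests BGR), though the host code that fills these uniforms is not part of this diff. A sketch of that assumed mapping:

    /* Assumed planar row mapping for the letterbox output; the values
     * of r_order/b_order are inferred, not shown in this commit. */
    inline int3 demo_plane_rows(int y, int out_height, int reverse_channel)
    {
        int r_order = reverse_channel ? 2 * out_height : 0;
        int b_order = reverse_channel ? 0 : 2 * out_height;
        return (int3)(y + r_order,    /* R row */
                      y + out_height, /* G row */
                      y + b_order);   /* B row */
    }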
@@ -10,7 +10,12 @@
 #include "cl_viv_vx_ext.h"
 
 _viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32;
+_viv_uniform VXC_512Bits uniExtract8Bin_2x8;
 _viv_uniform int sf_size;
+_viv_uniform float srcScale;
+_viv_uniform float srcZP;
+_viv_uniform float dstScale;
+_viv_uniform float dstZP;
 #define F_MAX(a,b) ((a)>(b)?(a):(b))
 __kernel void Softmax2VXC
 (
@@ -19,35 +24,37 @@ __kernel void Softmax2VXC
     int axis
 )
 {
-    int4 coord_in = (int4)(0,0,0,0);
+    int4 coord_in = (int4)(0, get_global_id(0), 0, 0);
-    float fMax = 0.0;
+    float fMax = 0;
     for (int i = 0; i < sf_size; i++)
     {
-        vxc_char8 val;
+        vxc_short8 val;
+        vxc_half8 val_h;
         coord_in.x = i;
-        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
+        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+        _viv_asm(COPY, val_h, val, 16);
         float fval;
-        VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
+        VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
 
         fMax = F_MAX(fMax, fval);
     }
 
     float fProbSum = 0.0f;
     vxc_short8 dst;
     for (int i = 0; i < sf_size; i++)
     {
-        vxc_char8 val;
+        vxc_short8 val;
+        vxc_half8 val_h;
         coord_in.x = i;
-        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
+        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+        _viv_asm(COPY, val_h, val, 16);
         float fval;
-        VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
+        VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
 
         float fOut = (float)exp(fval - fMax);
         fProbSum += fOut;
         half hVal;
-        _viv_asm(CONV,hVal,fOut);
+        _viv_asm(CONV, hVal, fOut);
-        _viv_asm(COPY,dst,hVal, 4);
+        _viv_asm(COPY, dst, hVal, 4);
 
         VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
     }
@@ -56,15 +63,68 @@ __kernel void Softmax2VXC
         vxc_short8 val;
         vxc_half8 val_h;
         coord_in.x = i;
-        VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
+        VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
         float fval;
         _viv_asm(COPY, val_h, val, 16);
         VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
-        float fOut =fval/fProbSum;
+        float fOut = fval / fProbSum;
         half hVal;
-        _viv_asm(CONV,hVal,fOut);
+        _viv_asm(CONV, hVal, fOut);
         _viv_asm(COPY, dst, hVal, 4);
         VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
     }
 }
+
+__kernel void Softmax2VXC_u8
+    (
+    image2d_array_t input,
+    image2d_array_t output,
+    int axis
+    )
+{
+    int4 coord_in = (int4)(0, get_global_id(0), 0, 0);
+    float fMax = -3.4e38f;
+    for (int i = 0; i < sf_size; i++)
+    {
+        vxc_uchar8 val;
+        coord_in.x = i;
+        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+        float fval;
+        VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
+        fval = (fval - srcZP) * srcScale;
+        fMax = F_MAX(fMax, fval);
+    }
+
+    float fProbSum = 0.0f;
+    vxc_uchar8 dst;
+    for (int i = 0; i < sf_size; i++)
+    {
+        vxc_uchar8 val;
+
+        coord_in.x = i;
+        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+        float fval;
+        VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
+        fval = (fval - srcZP) * srcScale;
+        float fOut = (float)exp(fval - fMax);
+        fProbSum += fOut;
+    }
+
+    for (int i = 0; i < sf_size; i++)
+    {
+        vxc_uchar8 val;
+        coord_in.x = i;
+        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+        float fval;
+        VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
+        fval = (fval - srcZP) * srcScale;
+
+        float fOut = exp(fval - fMax) / fProbSum;
+
+        fOut = fOut * dstScale + dstZP;
+        short dst0;
+        _viv_asm(CONV, dst0, fOut);
+        VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8);
+        VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+    }
+}
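Both kernels rely on the max-subtraction form softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x)), which is algebraically identical to the naive formula but keeps exp() in range. Note that the new u8 variant seeds the maximum with -3.4e38f (roughly -FLT_MAX) so all-negative rows are handled, and it wraps the math in a dequantize step (srcScale/srcZP) before and a requantize step (dstScale/dstZP) after. A scalar sketch of the three-pass structure:

    /* Numerically stable softmax over one row of length n; sketch of
     * the structure each work item implements above. */
    void demo_softmax_row(const float *in, float *out, int n)
    {
        float fMax = -3.4e38f;                      /* running maximum */
        for (int i = 0; i < n; i++) fMax = fmax(fMax, in[i]);

        float fProbSum = 0.0f;                      /* normalizer */
        for (int i = 0; i < n; i++) fProbSum += exp(in[i] - fMax);

        for (int i = 0; i < n; i++)
            out[i] = exp(in[i] - fMax) / fProbSum;
    }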
@@ -16,7 +16,7 @@ _viv_uniform float sum_x2_tail1;
 _viv_uniform float output_scale;
 _viv_uniform float output_zp;
 
-#define GROUP_NORM_SUMS_16BITS_IMPL(name, src_type) \
+#define GROUP_NORM_SUMS_16BITS_IMPL(name, load_type, src_type) \
 __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \
     __read_only  image2d_array_t input, \
     __write_only image2d_array_t output, \
@@ -26,7 +26,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
     int lidx = get_local_id(0); \
     int gidz = get_global_id(1); \
     int4 coord = (int4)(gidx, 0, gidz, 0); \
-    vxc_short8 src0; \
+    load_type src; \
     src_type in_h; \
     float4 sumsqr; \
     float4 tmpSumSqr = (float4)(0); \
@@ -43,9 +43,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
     { \
         for(coord.y = 0; coord.y < height;) \
         { \
-            VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+            VXC_OP4(img_load_3d, src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
             coord.y++; \
-            _viv_asm(COPY, in_h, src0, 16); \
+            _viv_asm(COPY, in_h, src, 16); \
             VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \
             tmpSumSqr += sumsqr; \
         } \
@@ -76,10 +76,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
         write_imagef(output, coord_out, data); \
     } \
 }
-GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_half8)
+GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_short8, vxc_half8)
-GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8)
+GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8, vxc_short8)
+GROUP_NORM_SUMS_16BITS_IMPL(U16, vxc_ushort8, vxc_ushort8)
 
-#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \
+#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, load_type, src_type) \
 __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \
     __read_only  image2d_array_t input, \
     __write_only image2d_array_t output, \
@@ -89,7 +90,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
     int lidx = get_local_id(0); \
 \
     int2 coord = (int2)(gidx, get_global_id(1)); \
-    vxc_short8 src0; \
+    load_type src; \
     src_type in_h; \
     float4 sumsqr = (float4)(0); \
 \
@@ -98,8 +99,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
 \
     if(gidx < width) \
     { \
-        VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+        VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
-        _viv_asm(COPY, in_h, src0, 16); \
+        _viv_asm(COPY, in_h, src, 16); \
         VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \
         sumsqr.y = sumsqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sumsqr.x; \
         sumsqr.x = sumsqr.x * input_scale + sum_x_tail; \
@@ -128,8 +129,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
         write_imagef(output, coord_out, data); \
     } \
 }
-GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8)
+GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_short8, vxc_half8)
-GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8)
+GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8, vxc_short8)
+GROUP_NORM_SUMS_16BITS_IMPL_2D(U16, vxc_ushort8, vxc_ushort8)
 
#define GROUP_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \
|
#define GROUP_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \
|
||||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \
|
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \
|
||||||
|
|
@ -178,7 +180,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
|
||||||
_viv_asm(CONV_RTE, tmpVal0, norm); \
|
_viv_asm(CONV_RTE, tmpVal0, norm); \
|
||||||
norm = alpha * tmpData1 + bias_val; \
|
norm = alpha * tmpData1 + bias_val; \
|
||||||
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
||||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
|
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||||
_viv_asm(COPY, outval, dst, 16); \
|
_viv_asm(COPY, outval, dst, 16); \
|
||||||
VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
|
VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
|
||||||
}
|
}
|
||||||
|
|
@ -230,10 +232,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
|
||||||
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
|
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
|
||||||
float4 norm; \
|
float4 norm; \
|
||||||
norm = alpha * tmpData0 + bias_val; \
|
norm = alpha * tmpData0 + bias_val; \
|
||||||
_viv_asm(CONV, tmpVal0, norm); \
|
_viv_asm(CONV_RTE, tmpVal0, norm); \
|
||||||
norm = alpha * tmpData1 + bias_val; \
|
norm = alpha * tmpData1 + bias_val; \
|
||||||
_viv_asm(CONV, tmpVal1, norm); \
|
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
||||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
|
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||||
_viv_asm(COPY, outval, dst, 16); \
|
_viv_asm(COPY, outval, dst, 16); \
|
||||||
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||||
}
|
}
|
||||||
|
|
@ -283,7 +285,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
|
||||||
\
|
\
|
||||||
float4 norm; \
|
float4 norm; \
|
||||||
norm = alpha * tmpData0 + bias_val; \
|
norm = alpha * tmpData0 + bias_val; \
|
||||||
_viv_asm(CONV, tmpVal0, norm); \
|
_viv_asm(CONV_RTE, tmpVal0, norm); \
|
||||||
norm = alpha * tmpData1 + bias_val; \
|
norm = alpha * tmpData1 + bias_val; \
|
||||||
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
||||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||||
|
|
@ -296,6 +298,7 @@ GROUP_NORM_16BITS_F32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int
|
||||||
GROUP_NORM_16BITS_F32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)
|
GROUP_NORM_16BITS_F32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)
|
||||||
GROUP_NORM_16BITS_F32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)
|
GROUP_NORM_16BITS_F32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)
|
||||||
GROUP_NORM_16BITS_F32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)
|
GROUP_NORM_16BITS_F32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)
|
||||||
|
GROUP_NORM_16BITS_F32_IMPL(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4)
|
||||||
|
|
||||||
#define GROUP_NORM_16BITS_F32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \
|
#define GROUP_NORM_16BITS_F32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \
|
||||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \
|
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \
|
||||||
|
|
@ -333,10 +336,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
|
||||||
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
|
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
|
||||||
float4 norm; \
|
float4 norm; \
|
||||||
norm = alpha * tmpData0 + bias_val; \
|
norm = alpha * tmpData0 + bias_val; \
|
||||||
_viv_asm(CONV, tmpVal0, norm); \
|
_viv_asm(CONV_RTE, tmpVal0, norm); \
|
||||||
norm = alpha * tmpData1 + bias_val; \
|
norm = alpha * tmpData1 + bias_val; \
|
||||||
_viv_asm(CONV, tmpVal1, norm); \
|
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
||||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
|
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||||
_viv_asm(COPY, outval, dst, 16); \
|
_viv_asm(COPY, outval, dst, 16); \
|
||||||
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||||
}
|
}
|
||||||
|
|
@ -346,4 +349,5 @@ GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8,
|
||||||
GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)
|
GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)
|
||||||
GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)
|
GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)
|
||||||
GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)
|
GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)
|
||||||
|
GROUP_NORM_16BITS_F32_IMPL_2D(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
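Note on the group-norm hunks above: the sums macros now take the raw load container (load_type) separately from the arithmetic type (src_type), which is what lets the 16-bit path gain a U16 variant, and the CONV -> CONV_RTE plus the flipped last VXC_MODIFIER argument switch the float-to-integer conversion to round-to-nearest-even with, as far as the modifier layout goes, a saturating extract. A minimal sketch of the lines the U16 instantiation expands to, taken from the macro body above:

    /* GROUP_NORM_SUMS_16BITS_IMPL(U16, vxc_ushort8, vxc_ushort8) after expansion */
    vxc_ushort8 src;   /* load_type: raw pixels as loaded from the image */
    vxc_ushort8 in_h;  /* src_type: same container for the integer paths */
    VXC_OP4(img_load_3d, src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, in_h, src, 16); /* bitwise copy of 16 bytes; for F16 this copy is
                                      what reinterprets the vxc_short8 load as vxc_half8 */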
@@ -115,45 +115,45 @@ _viv_uniform VXC_512Bits uniDataSubZPtoFp32Part1_4x4;
 _viv_uniform VXC_512Bits uniConvF16toF32_part0_4x4;
 _viv_uniform VXC_512Bits uniConvF16toF32_part1_4x4;
 _viv_uniform VXC_512Bits uniExtact8Bin_2x8;
-_viv_uniform int inputZP0;
-_viv_uniform int inputZP1;
-_viv_uniform float input_scale0;
-_viv_uniform float input_scale1;
-_viv_uniform float outputZP;
+_viv_uniform int input0_zp;
+_viv_uniform int input1_zp;
+_viv_uniform float input0_scale;
+_viv_uniform float input1_scale;
+_viv_uniform float output_zp;
-#define PRELU_F16_3D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \
-__kernel void prelu_##name0##to##name1( \
+#define PRELU_F16_3D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \
+__kernel void prelu_##name( \
     __read_only  image2d_array_t input0, \
     __read_only  image2d_array_t input1, \
     __write_only image2d_array_t output) \
 {\
     int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\
-    vxc_float4 vecA, vecB, vecC, vecD;\
+    float4 vecA, vecB, vecC, vecD;\
     input_type0 srcA;\
     copy_type0 src0;\
     vxc_short8 srcB;\
     vxc_half8 src1;\
-    input_type0 input_ZP;\
+    input_type0 zp;\
     VXC_ReadImage2DArray(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
     _viv_asm(COPY, src0, srcA, 16); \
     VXC_ReadImage2DArray(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
     _viv_asm(COPY, src1, srcB, 16); \
 \
-    _viv_asm(COPY, input_ZP, inputZP0, 4);\
-    VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
+    _viv_asm(COPY, zp, input0_zp, 4);\
+    VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
         uniDataSubZPtoFp32Part0_4x4); \
-    VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
+    VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
         uniDataSubZPtoFp32Part1_4x4);\
     VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\
     VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\
 \
-    vecA = vecA * input_scale0;\
-    vecB = vecB * input_scale0;\
-    vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \
-    vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \
-    vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \
-    vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \
-    vecA = maxData0 + vecC * minData0 + outputZP;\
-    vecB = maxData1 + vecD * minData1 + outputZP;\
+    vecA = vecA * input0_scale;\
+    vecB = vecB * input0_scale;\
+    float4 maxData0 = vecA > 0 ? vecA : 0.0; \
+    float4 maxData1 = vecB > 0 ? vecB : 0.0; \
+    float4 minData0 = vecA < 0 ? vecA : 0.0; \
+    float4 minData1 = vecB < 0 ? vecB : 0.0; \
+    vecA = maxData0 + vecC * minData0 + output_zp;\
+    vecB = maxData1 + vecD * minData1 + output_zp;\
     convert_type dst0, dst1;\
     _viv_asm(CONV_RTE, dst0, vecA);\
     _viv_asm(CONV_RTE, dst1, vecB);\
@@ -164,49 +164,49 @@ _viv_uniform float outputZP;
     VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
 }
 // name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type
-PRELU_F16_3D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
-PRELU_F16_3D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
-PRELU_F16_3D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
-PRELU_F16_3D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
-PRELU_F16_3D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
-PRELU_F16_3D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
-PRELU_F16_3D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
-PRELU_F16_3D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
-PRELU_F16_3D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
-PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
+PRELU_F16_3D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
+PRELU_F16_3D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
+PRELU_F16_3D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
+PRELU_F16_3D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
+PRELU_F16_3D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
+PRELU_F16_3D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
+PRELU_F16_3D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
+PRELU_F16_3D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
+PRELU_F16_3D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
+PRELU_F16_3D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
 
-#define PRELU_F16_2D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \
-__kernel void prelu_##name0##to##name1##_2D( \
+#define PRELU_F16_2D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \
+__kernel void prelu_##name##_2D( \
     __read_only  image2d_array_t input0, \
     __read_only  image2d_array_t input1, \
     __write_only image2d_array_t output) \
 {\
     int2 coord = (int2)(get_global_id(0), get_global_id(1));\
-    vxc_float4 vecA, vecB, vecC, vecD;\
+    float4 vecA, vecB, vecC, vecD;\
     input_type0 srcA;\
     copy_type0 src0;\
     vxc_short8 srcB;\
     vxc_half8 src1;\
-    input_type0 input_ZP;\
+    input_type0 zp;\
    VXC_ReadImage(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
     _viv_asm(COPY, src0, srcA, 16); \
     VXC_ReadImage(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
     _viv_asm(COPY, src1, srcB, 16); \
 \
-    _viv_asm(COPY, input_ZP, inputZP0, 4);\
-    VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
-    VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
+    _viv_asm(COPY, zp, input0_zp, 4);\
+    VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
+    VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
     VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\
     VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\
 \
-    vecA = vecA * input_scale0;\
-    vecB = vecB * input_scale0;\
-    vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \
-    vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \
-    vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \
-    vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \
-    vecA = maxData0 + vecC * minData0 + outputZP;\
-    vecB = maxData1 + vecD * minData1 + outputZP;\
+    vecA = vecA * input0_scale;\
+    vecB = vecB * input0_scale;\
+    float4 maxData0 = vecA > 0 ? vecA : 0.0; \
+    float4 maxData1 = vecB > 0 ? vecB : 0.0; \
+    float4 minData0 = vecA < 0 ? vecA : 0.0; \
+    float4 minData1 = vecB < 0 ? vecB : 0.0; \
+    vecA = maxData0 + vecC * minData0 + output_zp;\
+    vecB = maxData1 + vecD * minData1 + output_zp;\
     convert_type dst0, dst1;\
     _viv_asm(CONV_RTE, dst0, vecA);\
     _viv_asm(CONV_RTE, dst1, vecB);\
@@ -216,49 +216,49 @@ PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha
     _viv_asm(COPY, dst, dst2, 16); \
     VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
 }
-PRELU_F16_2D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
-PRELU_F16_2D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
-PRELU_F16_2D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
-PRELU_F16_2D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
-PRELU_F16_2D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
-PRELU_F16_2D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
-PRELU_F16_2D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
-PRELU_F16_2D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
-PRELU_F16_2D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
-PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
+PRELU_F16_2D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
+PRELU_F16_2D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
+PRELU_F16_2D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
+PRELU_F16_2D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
+PRELU_F16_2D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
+PRELU_F16_2D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
+PRELU_F16_2D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
+PRELU_F16_2D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
+PRELU_F16_2D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
+PRELU_F16_2D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
 
-#define PRELU_U8_2D(name, output_type, convert_type, copy_type) \
-__kernel void prelu_U8U8to##name##_2D( \
+#define PRELU_INTEGER_2D(name, src0_type, src1_type, output_type, convert_type, copy_type) \
+__kernel void prelu_##name##_2D( \
     __read_only  image2d_array_t input0, \
     __read_only  image2d_array_t input1, \
     __write_only image2d_array_t output) \
 {\
     int2 coord = (int2)(get_global_id(0), get_global_id(1));\
-    vxc_float4 vecA, vecB, vecC, vecD;\
-    vxc_uchar16 src0;\
-    vxc_uchar16 src1;\
-    vxc_uchar16 input_ZP0;\
-    vxc_uchar16 input_ZP1;\
+    float4 vecA, vecB, vecC, vecD;\
+    src0_type src0;\
+    src1_type src1;\
+    short zp0;\
+    short zp1;\
     VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
     VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
 \
-    _viv_asm(COPY, input_ZP0, inputZP0, 4);\
-    VXC_DP4x4(vecA, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
-    VXC_DP4x4(vecB, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
-    _viv_asm(COPY, input_ZP1, inputZP1, 4);\
-    VXC_DP4x4(vecC, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
-    VXC_DP4x4(vecD, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
+    _viv_asm(COPY, zp0, input0_zp, 2);\
+    VXC_DP4x4(vecA, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
+    VXC_DP4x4(vecB, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
+    _viv_asm(COPY, zp1, input1_zp, 4);\
+    VXC_DP4x4(vecC, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
+    VXC_DP4x4(vecD, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
 \
-    vecA = vecA * input_scale0;\
-    vecB = vecB * input_scale0;\
-    vecC = vecC * input_scale1;\
-    vecD = vecD * input_scale1;\
-    vxc_float4 maxData0 = vecA >= 0 ? vecA : 0.0; \
-    vxc_float4 maxData1 = vecB >= 0 ? vecB : 0.0; \
-    vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \
-    vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \
-    vecA = maxData0 + vecC * minData0 + outputZP;\
-    vecB = maxData1 + vecD * minData1 + outputZP;\
+    vecA = vecA * input0_scale;\
+    vecB = vecB * input0_scale;\
+    vecC = vecC * input1_scale;\
+    vecD = vecD * input1_scale;\
+    float4 maxData0 = vecA >= 0 ? vecA : 0.0; \
+    float4 maxData1 = vecB >= 0 ? vecB : 0.0; \
+    float4 minData0 = vecA < 0 ? vecA : 0.0; \
+    float4 minData1 = vecB < 0 ? vecB : 0.0; \
+    vecA = maxData0 + vecC * minData0 + output_zp;\
+    vecB = maxData1 + vecD * minData1 + output_zp;\
     convert_type dst0, dst1;\
     _viv_asm(CONV_RTE, dst0, vecA);\
     _viv_asm(CONV_RTE, dst1, vecB);\
@@ -268,7 +268,8 @@ PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha
     _viv_asm(COPY, dst, dst2, 16); \
     VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
 }
-PRELU_U8_2D(U8, vxc_uchar16, int4, vxc_uchar16)
-PRELU_U8_2D(F16, vxc_half8, half4, vxc_short8)
+PRELU_INTEGER_2D(U8U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
+PRELU_INTEGER_2D(U8U8toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
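The PReLU kernels above all follow the same dequantize / PReLU / requantize chain; the vector code is easier to read against a scalar sketch of one lane (a reference only, not part of the source):

    /* one lane of the PReLU math, mirroring maxData/minData + output_zp above */
    float prelu_ref(float x_minus_zp, float slope, float in_scale, float out_zp)
    {
        float v   = x_minus_zp * in_scale;   /* vecA = vecA * input0_scale              */
        float pos = v > 0.0f ? v : 0.0f;     /* maxData0                                */
        float neg = v < 0.0f ? v : 0.0f;     /* minData0                                */
        return pos + slope * neg + out_zp;   /* maxData0 + vecC * minData0 + output_zp  */
    }

In the F16-slope variants the slope arrives as half floats (vecC/vecD via uniConvF16toF32), while PRELU_INTEGER_2D additionally dequantizes the slope with zp1 and input1_scale before the multiply.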
@@ -0,0 +1,181 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
+_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp
+_viv_uniform VXC_512Bits uniResize2xUp_0_4x8;
+_viv_uniform VXC_512Bits uniResize2xUp_1_4x8;
+_viv_uniform int out_height;
+
+__kernel void resize_bilinear_U8toU8_2x_upsample_half_pixel_centers
+(
+    __read_only  image2d_array_t input,
+    __write_only image2d_array_t output,
+    int align_corners,
+    int half_pixel_centers
+)
+{
+    int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
+    int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
+    coord_in.x = (coord_out.x * 2 - 1) >> 2;
+    coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
+
+    vxc_uchar16 in0, in1, tmp, result;
+
+    int8 input_desc;
+    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
+    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
+    _viv_asm(MOV, coord_in.w, baseAddr);
+    VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+    VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+    int8 output_desc;
+    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
+    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
+    _viv_asm(MOV, coord_out.w, baseAddr);
+
+    vxc_ushort8 multiplier;
+    _viv_asm(COPY, multiplier, multAndoutZP, 16);
+
+    vxc_ushort8 dst0;
+    while (coord_out.y < out_height)
+    {
+        VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
+        VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
+        VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
+        VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
+        VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
+        VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
+        VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
+        VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
+        VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_in.y += 2;
+        coord_out.y++;
+    }
+}
+
+_viv_uniform VXC_512Bits uniResize4xUp_l00_4x8;
+_viv_uniform VXC_512Bits uniResize4xUp_l01_4x8;
+_viv_uniform VXC_512Bits uniResize4xUp_l10_4x8;
+_viv_uniform VXC_512Bits uniResize4xUp_l11_4x8;
+__kernel void resize_bilinear_U8toU8_4x_upsample_half_pixel_centers
+(
+    __read_only  image2d_array_t input,
+    __write_only image2d_array_t output,
+    int align_corners,
+    int half_pixel_centers
+)
+{
+    int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
+    int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
+    coord_in.x = (coord_out.x * 2 - 3) >> 3;
+    coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
+
+    vxc_uchar16 in0, in1, dst0, dst1;
+
+    int8 input_desc;
+    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
+    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
+    _viv_asm(MOV, coord_in.w, baseAddr);
+    VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+    VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+    int8 output_desc;
+    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
+    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
+    _viv_asm(MOV, coord_out.w, baseAddr);
+
+    vxc_ushort8 multiplier;
+    _viv_asm(COPY, multiplier, multAndoutZP, 16);
+
+    vxc_ushort8 tmp;
+    while (coord_out.y < out_height)
+    {
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_in.y += 2;
+        coord_out.y++;
+    }
+}
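Both upsample kernels in this file derive the source column from the half-pixel-centers mapping src = (dst + 0.5) / scale - 0.5, which for an integer scale rearranges to floor((2*dst - (scale - 1)) / (2*scale)); the shifts above are exactly that floor division. A sketch of the equivalence (the helper name is illustrative only):

    /* half-pixel-centers source index for a power-of-two upscale factor */
    /* log2_scale = 1 gives (2*d - 1) >> 2, the 2x kernel's expression;  */
    /* log2_scale = 2 gives (2*d - 3) >> 3, the 4x kernel's expression   */
    int src_index(int d, int log2_scale)
    {
        int scale = 1 << log2_scale;
        return (2 * d - (scale - 1)) >> (log2_scale + 1); /* floor division */
    }

The coord_out.x == 0 override pins the first column to source column -1; the arithmetic shift already yields that value, so for these power-of-two kernels the ternary is a belt-and-braces guard.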
@@ -0,0 +1,102 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
+_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp
+_viv_uniform VXC_512Bits uniResize3xUp_l00_2x8;
+_viv_uniform VXC_512Bits uniResize3xUp_l01_2x8;
+_viv_uniform VXC_512Bits uniResize3xUp_l10_4x4;
+_viv_uniform VXC_512Bits uniResize3xUp_l11_4x4;
+_viv_uniform VXC_512Bits uniResize3xUp_l12_4x4;
+_viv_uniform VXC_512Bits uniResize3xUp_l13_4x4;
+__kernel void resize_bilinear_U8toU8_3x_upsample_half_pixel_centers
+(
+    __read_only  image2d_array_t input,
+    __write_only image2d_array_t output,
+    int align_corners,
+    int half_pixel_centers
+)
+{
+    int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
+    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
+    coord_in.x = (short)(coord_out.x * 2 - 1) / (short)6;
+    coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
+    coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6;
+    coord_in.y = coord_out.y == 0 ? -1 : coord_in.y;
+
+    vxc_uchar16 in0, in1, in2, in3, tmp, dst0, dst1, dst2;
+
+    int8 input_desc;
+    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
+    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
+    _viv_asm(MOV, coord_in.w, baseAddr);
+    VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+    VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+    VXC_OP4(img_load_3d, in2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+    VXC_OP4(img_load_3d, in3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+    int8 output_desc;
+    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
+    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
+    _viv_asm(MOV, coord_out.w, baseAddr);
+
+    vxc_ushort8 multiplier;
+    _viv_asm(COPY, multiplier, multAndoutZP, 16);
+
+    vxc_ushort8 data;
+
+    VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
+    VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
+    VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
+    VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
+    VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
+        VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
+    coord_out.y++;
+
+    VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);
+    VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);
+    VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
+    VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
+    VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
+    VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
+    VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
+    VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
+    VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
+    VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
+    VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+        VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
+    coord_out.y++;
+    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+        VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
+    coord_out.y++;
+    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
+        VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
+    coord_out.y++;
+
+    VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);
+    VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);
+    VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
+    VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
+    VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
+    VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
+    VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+        VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
+    coord_out.y++;
+    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+        VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
+}
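The 3x factor is not a power of two, so this kernel divides in short arithmetic instead of shifting; C-style division truncates toward zero rather than flooring, which differs from the exact mapping only at dst == 0, and that is precisely the case the ternary override patches:

    /* 3x half-pixel-centers mapping, as in the kernel above            */
    /* exact: src = floor((2*d - 2) / 6); trunc((2*d - 1) / 6) matches  */
    /* it for every d >= 1 (the numerator is odd, so never a multiple   */
    /* of 6), leaving only d == 0 to force to -1                        */
    short src3 = (short)(d * 2 - 1) / (short)6;
    src3 = (d == 0) ? (short)-1 : src3;

Unlike the looping 2x/4x/8x kernels, this one maps both x and y this way and writes its fixed block of output rows directly (six 15-pixel stores) rather than iterating to out_height.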
@@ -0,0 +1,167 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
+_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp
+_viv_uniform int out_height;
+_viv_uniform VXC_512Bits uniResize8xUp_l00_4x8;
+_viv_uniform VXC_512Bits uniResize8xUp_l01_4x8;
+_viv_uniform VXC_512Bits uniResize8xUp_l10_4x8;
+_viv_uniform VXC_512Bits uniResize8xUp_l11_4x8;
+_viv_uniform VXC_512Bits uniResize8xUp_l20_4x8;
+_viv_uniform VXC_512Bits uniResize8xUp_l21_4x8;
+_viv_uniform VXC_512Bits uniResize8xUp_l30_4x8;
+_viv_uniform VXC_512Bits uniResize8xUp_l31_4x8;
+__kernel void resize_bilinear_U8toU8_8x_upsample_half_pixel_centers
+(
+    __read_only  image2d_array_t input,
+    __write_only image2d_array_t output,
+    int align_corners,
+    int half_pixel_centers
+)
+{
+    int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
+    int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
+    coord_in.x = (coord_out.x * 2 - 7) >> 4;
+    coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
+
+    vxc_uchar16 in0, in1, in2, dst0, dst1, dst2, dst3;
+
+    int8 input_desc;
+    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
+    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
+    _viv_asm(MOV, coord_in.w, baseAddr);
+    VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+    VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
+        VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+    int8 output_desc;
+    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
+    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
+    _viv_asm(MOV, coord_out.w, baseAddr);
+
+    vxc_ushort8 multiplier;
+    _viv_asm(COPY, multiplier, multAndoutZP, 16);
+
+    vxc_ushort8 tmp;
+    while (coord_out.y < out_height)
+    {
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
+        VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
+        VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
+        VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
+        VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
+        VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
+        VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
+        VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
+        VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
+        VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
+        VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
+        VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
+        VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_in.y += 2;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
+        VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
+        VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
+        VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
+        VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
+        VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
+        VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+        VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
+            VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+        coord_out.y++;
+    }
+}
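All four resize kernels finish with the same U8 post-process: the bilinear accumulator is requantized by the multiplier and output zero point that the host packs into multAndoutZP (see the inline comment on that uniform). The exact fixed-point layout of the packed multiplier is defined on the host side, so the following is only a scalar sketch of the intent:

    /* one lane of the uniU8PostProcess_2x8 step; the DP2x8 with        */
    /* VXC_RM_ToNearestEven and the saturation flag set performs the    */
    /* rounding and clamping of all 8 lanes in a single instruction     */
    uchar u8_postprocess(ushort acc, float multiplier_f, float output_zp_f)
    {
        return convert_uchar_sat_rte((float)acc * multiplier_f + output_zp_f);
    }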
@@ -0,0 +1,303 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform float scale0;
+_viv_uniform float scale1;
+_viv_uniform float output_zp;
+_viv_uniform int half_head_size;
+_viv_uniform VXC_512Bits uniATimesB_0_4x4;
+_viv_uniform VXC_512Bits uniATimesB_1_4x4;
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;
+
+#define ROPE_BNHS_SYMM(name, src_type, src1_type, copy_type, dst_type) \
+__kernel void rope_##name##_bnhs \
+( \
+    __read_only  image2d_array_t input, \
+    __read_only  image2d_array_t cos_cache, \
+    __read_only  image2d_array_t sin_cache, \
+    __write_only image2d_array_t output, \
+    int axis \
+) \
+{ \
+    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+    int4 coord_out = coord_in; \
+\
+    int8 input_desc; \
+    _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
+    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
+    _viv_asm(MOV, coord_in.w, baseAddr); \
+\
+    src_type data0, data1; \
+    src1_type cos, sin; \
+    copy_type v0, v1; \
+    dst_type dst; \
+    VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
+        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+    VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+    _viv_asm(COPY, cos, v0, 16); \
+    VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+    _viv_asm(COPY, sin, v1, 16); \
+    coord_in.y += half_head_size; \
+    VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, \
+        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+\
+    int8 output_desc; \
+    _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
+    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
+    _viv_asm(MOV, coord_out.w, baseAddr); \
+\
+    float4 data2, data3, data4, data5; \
+    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
+    VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
+    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
+    VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
+    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
+    data3 = data3 * scale0 - data5 * scale1 + output_zp; \
+\
+    int4 dst0 = convert_int4_rte(data2); \
+    int4 dst1 = convert_int4_rte(data3); \
+\
+    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
+    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
+        dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+\
+    VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
+    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
+    VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
+    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
+    data2 = data2 * scale1 + data4 * scale0 + output_zp; \
+    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
+\
+    dst0 = convert_int4_rte(data2); \
+    dst1 = convert_int4_rte(data3); \
+\
+    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
+    coord_out.y += half_head_size; \
+    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
+        dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
+}
+ROPE_BNHS_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
+ROPE_BNHS_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
+ROPE_BNHS_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
+ROPE_BNHS_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
+ROPE_BNHS_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
+ROPE_BNHS_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)
|
|
||||||
|
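/* Note: the SYMM kernels above apply the rotary embedding to a half-split head. With
 * x_lo the first half_head_size elements and x_hi the second half, each lane is
 * rotated as
 *     out_lo = (x_lo * cos) * scale0 - (x_hi * sin) * scale1 + output_zp
 *     out_hi = (x_lo * sin) * scale1 + (x_hi * cos) * scale0 + output_zp
 * i.e. the standard RoPE rotation with the symmetric quantization scales folded into
 * scale0/scale1. The uniATimesB_* dot-product tables and the scales are configured
 * host-side. */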
__kernel void rope_F16_F16toF16_bnhs
(
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
)
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
    int4 coord_out = coord_in;

    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);

    vxc_short8 v0, v1, v2, v3, dst;
    vxc_half8 data0, data1, cos, sin, dst2;
    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data0, v0, 16);
    VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, cos, v1, 16);
    VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, sin, v2, 16);
    coord_in.y += half_head_size;
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data1, v3, 16);

    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 data2, data3, data4, data5;
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    data2 = data2 - data4;
    data3 = data3 - data5;

    half4 dst0;
    half4 dst1;
    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    data2 = data2 * scale1 + data4 * scale0 + output_zp;
    data3 = data3 * scale1 + data5 * scale0 + output_zp;

    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    coord_out.y += half_head_size;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
#define ROPE_ASYM_BNHS(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnhs \
( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
    int4 coord_out = coord_in; \
 \
    int8 input_desc; \
    _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
    _viv_asm(MOV, coord_in.w, baseAddr); \
 \
    dtype data0, data1, dst; \
    src1_type cos, sin; \
    copy_type v0, v1; \
    VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
    coord_in.y += half_head_size; \
    VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    int8 output_desc; \
    _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
    _viv_asm(MOV, coord_out.w, baseAddr); \
 \
    float4 l00, l01, cos0, cos1; \
    float4 l10, l11, sin0, sin1; \
    VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \
    float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
 \
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_OP4_NoDest(img_store_3d, output, \
        coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \
    data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
 \
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    coord_out.y += half_head_size; \
    VXC_OP4_NoDest(img_store_3d, output, \
        coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNHS(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNHS(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNHS(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNHS(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNHS(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNHS(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)

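/* Note: the ASYM variants handle asymmetric quantization: each operand first has its
 * zero point removed via the uniAMinusZp_* tables, the rotation runs in float, and the
 * result is requantized. A minimal scalar sketch of the same arithmetic, with
 * illustrative names only:
 *
 *     float lo = (float)(x[i]                  - in0_zp);
 *     float hi = (float)(x[i + half_head_size] - in0_zp);
 *     float c  = (float)(cos_q[i] - cos_zp);
 *     float s  = (float)(sin_q[i] - sin_zp);
 *     out[i]                  = round(lo * c * scale0 - hi * s * scale1 + output_zp);
 *     out[i + half_head_size] = round(lo * s * scale1 + hi * c * scale0 + output_zp);
 */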
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
__kernel void rope_BF16_BF16toBF16_bnhs
(
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
)
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
    int4 coord_out = coord_in;

    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);

    vxc_ushort8 v0, v1, v2, v3, dst;
    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    coord_in.y += half_head_size;
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
    vxc_short8 data;
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src0, data, 16);
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src1, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, cos0, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, cos1, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, sin0, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, sin1, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src2, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src3, data, 16);

    float4 data0 = src0 * cos0 - src2 * sin0;
    float4 data1 = src1 * cos1 - src3 * sin1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    data0 = src0 * sin0 + src2 * cos0;
    data1 = src1 * sin1 + src3 * cos1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    coord_out.y += half_head_size;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

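/* Note: the BF16 kernels widen to f32 before rotating. bfloat16 is the high half of an
 * IEEE f32, so the uniConvBF16toF32_Part* tables in effect place each 16-bit lane into
 * the top of a 32-bit lane, and uniExtractOddData_2x8 narrows back by keeping the high
 * shorts. A scalar sketch of the two conversions (assumed from the uniform names):
 *
 *     uint32_t f32_bits = (uint32_t)bf16_bits << 16;   // bf16 -> f32, exact
 *     uint16_t bf16_out = (uint16_t)(f32_bits >> 16);  // f32 -> bf16, truncating
 */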
@@ -0,0 +1,245 @@
#include "cl_viv_vx_ext.h"

_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform int half_head_size;
_viv_uniform VXC_512Bits uniATimesB_0_4x4;
_viv_uniform VXC_512Bits uniATimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;

#define ROPE_BNH1_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bnh1 \
( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
) \
{ \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
 \
    src_type data0, data1; \
    src1_type cos, sin; \
    copy_type v0, v1; \
    VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
    coord.x += half_head_size; \
    VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    float4 data2, data3, data4, data5; \
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
    VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
    VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
    data3 = data3 * scale0 - data5 * scale1 + output_zp; \
 \
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
 \
    dst_type dst; \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
    VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
    data2 = data2 * scale1 + data4 * scale0 + output_zp; \
    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
 \
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_BNH1_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BNH1_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BNH1_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BNH1_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BNH1_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BNH1_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)

__kernel void rope_F16_F16toF16_bnh1
(
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
)
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));

    vxc_short8 v0, v1, v2, v3, dst;
    vxc_half8 data0, data1, cos, sin, dst2;
    VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data0, v0, 16);
    VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, cos, v1, 16);
    VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, sin, v2, 16);
    coord.x += half_head_size;
    VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data1, v3, 16);

    float4 data2, data3, data4, data5;
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    data2 = data2 - data4;
    data3 = data3 - data5;

    half4 dst0;
    half4 dst1;
    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    data2 = data2 + data4;
    data3 = data3 + data5;

    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
#define ROPE_ASYM_BNH1(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnh1 \
( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
) \
{ \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
 \
    dtype data0, data1, dst; \
    src1_type cos, sin; \
    copy_type v0, v1; \
    VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
    coord.x += half_head_size; \
    VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    float4 l00, l01, cos0, cos1; \
    float4 l10, l11, sin0, sin1; \
    VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \
    float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
 \
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \
    data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
 \
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNH1(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNH1(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNH1(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNH1(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNH1(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNH1(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
__kernel void rope_BF16_BF16toBF16_bnh1
(
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
)
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));

    vxc_ushort8 v0, v1, v2, v3, dst;
    VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    coord.x += half_head_size;
    VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
    vxc_short8 data;
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src0, data, 16);
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src1, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, cos0, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, cos1, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, sin0, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, sin1, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src2, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src3, data, 16);

    float4 data0 = src0 * cos0 - src2 * sin0;
    float4 data1 = src1 * cos1 - src3 * sin1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    data0 = src0 * sin0 + src2 * cos0;
    data1 = src1 * sin1 + src3 * cos1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

@@ -0,0 +1,312 @@
#include "cl_viv_vx_ext.h"

_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;

#define ROPE_BSNH_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bsnh \
( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
 \
    src_type data0, data1; \
    src1_type cos, sin; \
    copy_type v0, v1; \
    dst_type dst; \
    VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
 \
    coord_in.x *= 2; \
    int8 input_desc; \
    _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
    _viv_asm(MOV, coord_in.w, baseAddr); \
    VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    int4 coord_out = coord_in; \
    int8 output_desc; \
    _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
    _viv_asm(MOV, coord_out.w, baseAddr); \
 \
    float4 data2, data3, data4, data5; \
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
    VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
    VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
 \
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
        dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
    VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
 \
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    coord_out.x += 8; \
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
        dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_BSNH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BSNH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BSNH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BSNH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BSNH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BSNH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)

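/* Note: unlike the half-split bnhs/bnh1 layouts, the bsnh kernels rotate interleaved
 * (even, odd) element pairs: coord_in.x *= 2 points each work-item at 16 consecutive
 * elements, the uniAEvenTimesB_* / uniAOddTimesB_* tables select the even and odd
 * lanes, and each pair is rotated as
 *     out_even = x_even * cos * scale0 - x_odd * sin * scale1 + output_zp
 *     out_odd  = x_even * sin * scale1 + x_odd * cos * scale0 + output_zp
 * with the two 8-lane results stored at x and x + 8. */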
__kernel void rope_F16_F16toF16_bsnh
(
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
)
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));

    vxc_short8 v0, v1, v2, v3, dst;
    vxc_half8 data0, data1, cos, sin, dst2;
    VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, cos, v1, 16);
    VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, sin, v2, 16);

    coord_in.x *= 2;
    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);

    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data0, v0, 16);
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data1, v3, 16);

    int4 coord_out = coord_in;
    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 data2, data3, data4, data5;
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
    VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
    VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
    data2 = data2 - data4;
    data3 = data3 + data5;

    half4 dst0;
    half4 dst1;
    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
    VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
    data2 = data2 - data4;
    data3 = data3 + data5;

    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    coord_out.x += 8;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4;
_viv_uniform VXC_512Bits uniAOddMinusZp_4x4;
#define ROPE_ASYM_BSNH(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bsnh \
( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
 \
    dtype data0, data1, dst; \
    src1_type cos, sin; \
    copy_type v0, v1; \
 \
    VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
    coord_in.x *= 2; \
    int8 input_desc; \
    _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
    _viv_asm(MOV, coord_in.w, baseAddr); \
    VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    int4 coord_out = coord_in; \
    int8 output_desc; \
    _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
    _viv_asm(MOV, coord_out.w, baseAddr); \
 \
    float4 l00, l01, cos0, cos1; \
    float4 l10, l11, sin0, sin1; \
    VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
    VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
    VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \
    float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \
 \
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_OP4_NoDest(img_store_3d, output, \
        coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
    VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
    data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
    data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
 \
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    coord_out.x += 8; \
    VXC_OP4_NoDest(img_store_3d, output, \
        coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BSNH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BSNH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BSNH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BSNH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BSNH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BSNH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
__kernel void rope_BF16_BF16toBF16_bsnh
(
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
)
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));

    vxc_ushort8 v0, v1, v2, v3, dst;
    VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    coord_in.x *= 2;
    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);
    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    int4 coord_out = coord_in;
    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
    vxc_short8 data;
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src0, data, 16);
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src1, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, cos0, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, cos1, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, sin0, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, sin1, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src2, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src3, data, 16);

    float4 even = (float4)(src0.xz, src1.xz);
    float4 odd = (float4)(src0.yw, src1.yw);
    float4 data0 = even * cos0 - odd * sin0;
    float4 data1 = even * sin0 + odd * cos0;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    even = (float4)(src2.xz, src3.xz);
    odd = (float4)(src2.yw, src3.yw);
    data0 = even * cos1 - odd * sin1;
    data1 = even * sin1 + odd * cos1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    coord_out.x += 8;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

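/* Note: the BF16 bsnh path has no even/odd dot-product tables, so it deinterleaves
 * with plain swizzles after widening: (src0.xz, src1.xz) gathers the even elements
 * and (src0.yw, src1.yw) the odd ones. uniExtractOddData_2x8 then narrows each f32
 * back to bf16 by keeping its high 16 bits; the exact output lane packing is
 * configured host-side in that uniform. */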
@@ -0,0 +1,312 @@
#include "cl_viv_vx_ext.h"

_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;

#define ROPE_BNSH_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bnsh \
( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
 \
    src_type data0, data1; \
    src1_type cos, sin; \
    copy_type v0, v1; \
    dst_type dst; \
    VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
 \
    coord_in.x *= 2; \
    int8 input_desc; \
    _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
    _viv_asm(MOV, coord_in.w, baseAddr); \
    VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    int4 coord_out = coord_in; \
    int8 output_desc; \
    _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
    _viv_asm(MOV, coord_out.w, baseAddr); \
 \
    float4 data2, data3, data4, data5; \
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
    VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
    VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
 \
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
        dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
    VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
 \
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    coord_out.x += 8; \
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
        dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_BNSH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BNSH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BNSH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BNSH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BNSH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BNSH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)

__kernel void rope_F16_F16toF16_bnsh
(
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
)
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));

    vxc_short8 v0, v1, v2, v3, dst;
    vxc_half8 data0, data1, cos, sin, dst2;
    VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, cos, v1, 16);
    VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, sin, v2, 16);

    coord_in.x *= 2;
    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);

    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data0, v0, 16);
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data1, v3, 16);

    int4 coord_out = coord_in;
    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 data2, data3, data4, data5;
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
    VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
    VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
    data2 = data2 - data4;
    data3 = data3 + data5;

    half4 dst0;
    half4 dst1;
    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
    VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
    data2 = data2 - data4;
    data3 = data3 + data5;

    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    coord_out.x += 8;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

||||||
|
_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4;
_viv_uniform VXC_512Bits uniAOddMinusZp_4x4;

#define ROPE_ASYM_BNSH(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnsh \
    ( \
    __read_only  image2d_array_t input, \
    __read_only  image2d_array_t cos_cache, \
    __read_only  image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
 \
    dtype data0, data1, dst; \
    src1_type cos, sin; \
    copy_type v0, v1; \
 \
    VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
    coord_in.x *= 2; \
    int8 input_desc; \
    _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
    _viv_asm(MOV, coord_in.w, baseAddr); \
    VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    int4 coord_out = coord_in; \
    int8 output_desc; \
    _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
    _viv_asm(MOV, coord_out.w, baseAddr); \
 \
    float4 l00, l01, cos0, cos1; \
    float4 l10, l11, sin0, sin1; \
    VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
    VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
    VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \
    float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \
 \
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_OP4_NoDest(img_store_3d, output, \
        coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
 \
    VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
    VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
    data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
    data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
 \
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
 \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    coord_out.x += 8; \
    VXC_OP4_NoDest(img_store_3d, output, \
        coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNSH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNSH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNSH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNSH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNSH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNSH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)
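For reference, the macro above computes a rotary position embedding over asymmetric-quantized data: subtract each zero point, rotate paired lanes by (cos, sin), then requantize with round-to-nearest-even. Below is a minimal scalar C sketch of that math, assuming the even/odd pairing implied by uniAEvenMinusZp_4x4/uniAOddMinusZp_4x4 and per-tensor affine quantization; rope_rotate_asym_ref and requant are illustrative names, not ovxlib API.

/* Minimal sketch, not ovxlib code: scalar form of ROPE_ASYM_BNSH for U8.
 * scale0/scale1 are assumed to fold (in_scale * cos_scale) / out_scale
 * and (in_scale * sin_scale) / out_scale, mirroring the kernel. */
#include <math.h>
#include <stdint.h>

static uint8_t requant(float v)                 /* round-to-nearest-even, then clamp */
{
    long r = lrintf(v);
    return (uint8_t)(r < 0 ? 0 : (r > 255 ? 255 : r));
}

void rope_rotate_asym_ref(const uint8_t* x, const uint8_t* cos_c, const uint8_t* sin_c,
                          uint8_t* out, int half /* head_size / 2 */,
                          int in0_zp, int cos_zp, int sin_zp,
                          float scale0, float scale1, float output_zp)
{
    for (int i = 0; i < half; ++i)
    {
        /* even/odd lanes form the rotated pair (assumption: interleaved layout) */
        float xe = (float)(x[2 * i]     - in0_zp);
        float xo = (float)(x[2 * i + 1] - in0_zp);
        float c  = (float)(cos_c[i] - cos_zp);
        float s  = (float)(sin_c[i] - sin_zp);
        out[2 * i]     = requant(xe * c * scale0 - xo * s * scale1 + output_zp);
        out[2 * i + 1] = requant(xe * s * scale1 + xo * c * scale0 + output_zp);
    }
}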
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;

__kernel void rope_BF16_BF16toBF16_bnsh
    (
    __read_only  image2d_array_t input,
    __read_only  image2d_array_t cos_cache,
    __read_only  image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
    )
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));

    vxc_ushort8 v0, v1, v2, v3, dst;
    VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    coord_in.x *= 2;
    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);
    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    int4 coord_out = coord_in;
    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
    vxc_short8 data;
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src0, data, 16);
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src1, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, cos0, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, cos1, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, sin0, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, sin1, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src2, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src3, data, 16);

    float4 even = (float4)(src0.xz, src1.xz);
    float4 odd = (float4)(src0.yw, src1.yw);
    float4 data0 = even * cos0 - odd * sin0;
    float4 data1 = even * sin0 + odd * cos0;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    even = (float4)(src2.xz, src3.xz);
    odd = (float4)(src2.yw, src3.yw);
    data0 = even * cos1 - odd * sin1;
    data1 = even * sin1 + odd * cos1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    coord_out.x += 8;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
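The BF16 variant never does arithmetic on bfloat16 directly: uniConvBF16toF32_Part0/1_2x8 widen each bf16 lane to float32 by pairing it with a zero low half, and uniExtractOddData_2x8 takes the high 16 bits back after the rotation. That works because bfloat16 is exactly the top half of an IEEE-754 float32. A standalone C sketch of the conversion (not ovxlib code; the return path is plain truncation, which is what extracting the odd 16-bit lanes amounts to):

#include <stdint.h>
#include <string.h>

static float bf16_to_f32(uint16_t b)
{
    uint32_t u = (uint32_t)b << 16;     /* bf16 bits become the f32 high half */
    float f;
    memcpy(&f, &u, sizeof f);
    return f;
}

static uint16_t f32_to_bf16_trunc(float f)
{
    uint32_t u;
    memcpy(&u, &f, sizeof f);
    return (uint16_t)(u >> 16);         /* keep the high 16 bits, drop the rest */
}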
@@ -93,3 +93,101 @@ __kernel void scatter_nd_update_cpy2out_##src0_type##to##src0_type( \
 }
 SCATTER_ND_UPDATE_COPY2OUT(U8, vxc_uchar16, 1)
 SCATTER_ND_UPDATE_COPY2OUT(I8, vxc_char16, 1)
+SCATTER_ND_UPDATE_COPY2OUT(U16, vxc_ushort8, 2)
+SCATTER_ND_UPDATE_COPY2OUT(I16, vxc_short8, 2)
+
+#define SCATTER_ND_UPDATE_REF2OUT_16BITS(src0_type, data_type) \
+__kernel void scatter_nd_update_ref2out_##src0_type##to##src0_type( \
+    __read_only image2d_t input_ref, \
+    image2d_t temp_ref, \
+    image2d_t output0 \
+    ) \
+{ \
+    int gidx = get_global_id(0); \
+    Image img0 = create_image_from_image2d(input_ref, 2); \
+    Image img1 = create_image_from_image2d(temp_ref, 2); \
+    __global data_type* in_ptr = (__global data_type*)img0.ptr; \
+    __global data_type* out_ptr = (__global data_type*)img1.ptr; \
+    data_type src, dst; \
+    src = in_ptr[gidx]; \
+    vxc_ushort8 mp0; \
+    _viv_asm(COPY, mp0, multAndoutZP0, 16); \
+    VXC_DP2x8(dst, src, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+        uniU8MulAndPostShift0_Lo_2x8); \
+    out_ptr[gidx] = dst; \
+}
+SCATTER_ND_UPDATE_REF2OUT_16BITS(U16, vxc_ushort8)
+SCATTER_ND_UPDATE_REF2OUT_16BITS(I16, vxc_short8)
+
+#define SCATTER_ND_UPDATE_UPDATE2REF_16BITS(src0_type, data_type) \
+__kernel void scatter_nd_update_update2ref_##src0_type##to##src0_type##_16x( \
+    __read_only image2d_t input_index, \
+    __read_only image2d_t input_update, \
+    image2d_t temp_ref, \
+    image2d_t input0, \
+    image2d_t output1, \
+    int width, int area, int vol, int coord_dim \
+    ) \
+{ \
+    int gidx = get_global_id(0); \
+    int gidy = get_global_id(1); \
+ \
+    Image img1 = create_image_from_image2d(input_index, 4); \
+    Image img2 = create_image_from_image2d(input_update, 2); \
+    Image img3 = create_image_from_image2d(temp_ref, 2); \
+    __global int* index_ptr = (__global int*)img1.ptr; \
+    __global data_type* update_ptr = (__global data_type*)img2.ptr; \
+    __global data_type* output_ptr = (__global data_type*)img3.ptr; \
+    data_type dst; \
+ \
+    int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx); \
+    data_type src = update_ptr[gidy * update_width + gidx]; \
+    int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \
+    int loc = idx * output_width + gidx; \
+    vxc_ushort8 mp1; \
+    _viv_asm(COPY, mp1, multAndoutZP1, 16); \
+    VXC_DP2x8(dst, src, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+        uniU8MulAndPostShift1_Lo_2x8); \
+    output_ptr[loc] = dst; \
+}
+SCATTER_ND_UPDATE_UPDATE2REF_16BITS(U16, vxc_ushort8)
+SCATTER_ND_UPDATE_UPDATE2REF_16BITS(I16, vxc_short8)
+
+__kernel void scatter_nd_update_ref2out_F16toF16(
+    __read_only image2d_t input_ref,
+    image2d_t temp_ref,
+    image2d_t output0
+    )
+{
+    int gidx = get_global_id(0);
+    Image img0 = create_image_from_image2d(input_ref, 2);
+    Image img1 = create_image_from_image2d(temp_ref, 2);
+    __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)img0.ptr;
+    __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)img1.ptr;
+    out_ptr[gidx] = in_ptr[gidx];
+}
+
+__kernel void scatter_nd_update_update2ref_F16toF16_16x(
+    __read_only image2d_t input_index,
+    __read_only image2d_t input_update,
+    image2d_t temp_ref,
+    image2d_t input0,
+    image2d_t output1,
+    int width, int area, int vol, int coord_dim
+    )
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    Image img1 = create_image_from_image2d(input_index, 4);
+    Image img2 = create_image_from_image2d(input_update, 2);
+    Image img3 = create_image_from_image2d(temp_ref, 2);
+    __global int* index_ptr = (__global int*)img1.ptr;
+    __global vxc_ushort8* update_ptr = (__global vxc_ushort8*)img2.ptr;
+    __global vxc_ushort8* output_ptr = (__global vxc_ushort8*)img3.ptr;
+
+    int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx);
+    int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW;
+    int loc = idx * output_width + gidx;
+    output_ptr[loc] = update_ptr[gidy * update_width + gidx];
+}
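The update2ref kernels all share one addressing scheme: each index tuple is flattened by per-axis strides into a row of the reference tensor, then one update row is written at that offset. In the kernel source, offset_idx, offsetX..offsetW, update_width, and output_width appear to be compile-time constants injected when the program is built. A standalone C sketch of the same logic, with those constants turned into parameters purely for illustration (not ovxlib API):

#include <stdint.h>

void scatter_nd_update_ref(const int32_t* index, const int16_t* update, int16_t* ref,
                           int num_updates, int coord_dim, int update_width,
                           int output_width, const int offsets[4])
{
    for (int y = 0; y < num_updates; ++y)
    {
        const int32_t* id = index + y * coord_dim;
        int idx = 0;
        for (int k = 0; k < coord_dim; ++k)     /* idx = ix*offsetX + iy*offsetY + ... */
        {
            idx += id[k] * offsets[k];
        }
        for (int x = 0; x < update_width; ++x)  /* copy one update row into place */
        {
            ref[idx * output_width + x] = update[y * update_width + x];
        }
    }
}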
File diff suppressed because it is too large
@@ -29,6 +29,7 @@
 #include "VX/vx_ext_program.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_prv.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_log.h"
 #include "libnnext/vsi_nn_vxkernel.h"
 #include "kernel/vsi_nn_kernel.h"
@@ -198,10 +199,11 @@ static vsi_status vsi_nn_RegisterVXKernel
     vx_size * program_len = NULL;
     const char **program_src = NULL;
     vx_context ctx = NULL;
-    vsi_nn_context_t context = NULL;
     vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index];
     uint8_t i = 0;
     vsi_bool load_from_file = FALSE;
+    vsi_nn_runtime_option_t* options;
+    options = ((vsi_nn_graph_prv_t*)graph)->options;
 
 #define MAX_BUILDPROGRAM_LEN 128
     char cmd[MAX_BUILDPROGRAM_LEN] = {0};
@@ -210,8 +212,7 @@ static vsi_status vsi_nn_RegisterVXKernel
     memset(cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN);
     status = VSI_FAILURE;
     ctx = vxGetContext( (vx_reference)graph->g );
-    context = graph->ctx;
-    evis = context->config.evis.ver;
+    evis = options->config.evis.ver;
 
     program_src = (const char**)malloc(kernel_info->resource_num * sizeof(char *));
     CHECK_PTR_FAIL_GOTO( program_src, "Create buffer fail.", final );
@@ -244,12 +245,12 @@ static vsi_status vsi_nn_RegisterVXKernel
     {
         // set default evis version is 2
         snprintf(cmd, MAX_BUILDPROGRAM_LEN,
-            "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va);
+            "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", options->config.use_40bits_va);
     }
     else
     {
         snprintf(cmd, MAX_BUILDPROGRAM_LEN,
-            "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va);
+            "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, options->config.use_40bits_va);
     }
     status = vxBuildProgram(program, cmd);
@@ -302,7 +303,7 @@ static vsi_status vsi_nn_RegisterBinKernel
     vx_size program_len = 0;
     const uint8_t *program_ptr = NULL;
     vx_context ctx;
-    vsi_nn_context_t context;
+    vsi_nn_runtime_option_t* options;
     vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index];
 
 #define MAX_BUILDPROGRAM_LEN 128
@@ -313,8 +314,8 @@ static vsi_status vsi_nn_RegisterBinKernel
     status = VSI_FAILURE;
 
     ctx = vxGetContext( (vx_reference)graph->g );
-    context = graph->ctx;
-    evis = context->config.evis.ver;
+    options = ((vsi_nn_graph_prv_t*)graph)->options;
+    evis = options->config.evis.ver;
 
     program_ptr = vsi_nn_VxBinResourceGetResource(
         kernel_info->resource_name[kernel_info->resource_num - 1], &program_len);
@@ -337,12 +338,12 @@ static vsi_status vsi_nn_RegisterBinKernel
     {
         // set default evis version is 2
         snprintf(cmd, MAX_BUILDPROGRAM_LEN,
-            "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va);
+            "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", options->config.use_40bits_va);
     }
     else
     {
         snprintf(cmd, MAX_BUILDPROGRAM_LEN,
-            "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va);
+            "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, options->config.use_40bits_va);
     }
 #else
     snprintf(cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension");
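The net effect of these hunks is only where the EVIS version and the 40-bit-VA flag are read from: the per-graph vsi_nn_runtime_option_t now supplies them instead of the context. The command string handed to vxBuildProgram is unchanged. A tiny standalone C sketch of what that string looks like (the values are example assumptions, not read from a real graph):

#include <stdio.h>

int main(void)
{
    char cmd[128];
    int evis = 2, use_40bits_va = 1;    /* example values only */
    snprintf(cmd, sizeof cmd,
        "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, use_40bits_va);
    /* cmd: "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=1" */
    printf("%s\n", cmd);
    return 0;
}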
@@ -35,6 +35,8 @@
 #include "utils/vsi_nn_constraint_check.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_eltwise.h"
+#include "vsi_nn_tensor_util_prv.h"
+#include "vsi_nn_error.h"
 
 static vsi_status _try_set_high_presision_tensor
     (
@@ -120,9 +122,22 @@ static vsi_status _static_batchnorm
     vsi_nn_tensor_t ** outputs
     )
 {
+#define _TENSOR_LEN 64
     vsi_status status;
     vsi_nn_kernel_param_t * param = NULL;
     vsi_nn_tensor_t* reshape_tensors[6] = { NULL };
+    vsi_size_t shape[VSI_NN_MAX_DIM_NUM];
+    uint32_t new_rank = 4;
+    vsi_nn_tensor_t* input0 = NULL;
+    vsi_nn_tensor_t* output = NULL;
+    char reshape0_tensor_name[_TENSOR_LEN];
+    char reshape1_tensor_name[_TENSOR_LEN];
+    char batch_norm_tensor_name[_TENSOR_LEN];
+
+    memset(reshape0_tensor_name, 0, sizeof(reshape0_tensor_name));
+    memset(reshape1_tensor_name, 0, sizeof(reshape1_tensor_name));
+    memset(batch_norm_tensor_name, 0, sizeof(batch_norm_tensor_name));
+
     status = VSI_FAILURE;
 
     status = _try_set_high_presision_tensor(inputs);
@@ -131,10 +146,43 @@ static vsi_status _static_batchnorm
         VSILOGE("Set tensor attr of high presision fail");
         return status;
     }
-    if(_require_reshape(self, inputs))
+    if (_require_reshape(self, inputs))
     {
-        reshape_tensors[0] = self->nn_param.batch_norm.local->reshaped_input;
-        reshape_tensors[5] = self->nn_param.batch_norm.local->reshaped_output;
+        if (3 == inputs[0]->attr.dim_num)
+        {
+            shape[0] = inputs[0]->attr.size[0];
+            shape[1] = 1;
+            shape[2] = inputs[0]->attr.size[1];
+            shape[3] = inputs[0]->attr.size[2];
+        }
+        else if (5 == inputs[0]->attr.dim_num)
+        {
+            shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
+            shape[1] = inputs[0]->attr.size[2];
+            shape[2] = inputs[0]->attr.size[3];
+            shape[3] = inputs[0]->attr.size[4];
+        }
+
+        input0 = vsi_nn_kernel_insert_reshape_node(self->graph,
+            inputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_BACKWARD);
+        CHECK_PTR_FAIL_GOTO(input0, "Create tensor fail.", final);
+        reshape_tensors[0] = input0;
+        snprintf(reshape0_tensor_name, sizeof(reshape0_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0);
+        if (vxSetReferenceName((vx_reference)reshape_tensors[0]->t, reshape0_tensor_name) == VSI_FAILURE)
+        {
+            VSILOGW("Set uid %u reshape 0 node output name fail", self->uid);
+            goto final;
+        }
+        output = vsi_nn_kernel_insert_reshape_node(self->graph,
+            outputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_FORWARD);
+        CHECK_PTR_FAIL_GOTO(output, "Create tensor fail.", final);
+        reshape_tensors[5] = output;
+        snprintf(reshape1_tensor_name, sizeof(reshape1_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1);
+        if (vxSetReferenceName((vx_reference)outputs[0]->t, reshape1_tensor_name) == VSI_FAILURE)
+        {
+            VSILOGW("Set uid %u reshape 1 node output name fail", self->uid);
+            goto final;
+        }
     }
     else
     {
@@ -155,12 +203,26 @@ static vsi_status _static_batchnorm
         reshape_tensors, 5,
         &reshape_tensors[5], 1, param );
 
-    if( self->n )
+    if ( self->n )
     {
         status = VSI_SUCCESS;
     }
 
-    vsi_nn_kernel_param_release( &param );
+    vsi_nn_kernel_param_release(&param);
 
+    if (output)
+    {
+        snprintf(batch_norm_tensor_name, sizeof(batch_norm_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 2);
+        if (vxSetReferenceName((vx_reference)output->t, batch_norm_tensor_name) == VSI_FAILURE)
+        {
+            VSILOGW("Set uid %u instance_norm node output name fail", self->uid);
+            goto final;
+        }
+    }
+
+final:
+    vsi_safe_release_tensor(input0);
+    vsi_safe_release_tensor(output);
+
     return status;
 }
@@ -313,68 +375,6 @@ static vsi_status op_compute
     return status;
 } /* op_compute() */
 
-static vsi_status op_optimize
-    (
-    vsi_nn_node_t * self,
-    vsi_nn_tensor_t ** inputs,
-    vsi_nn_tensor_t ** outputs,
-    vsi_nn_opt_direction_e direction
-    )
-{
-    uint32_t dim = 0;
-    vsi_nn_batcnnorm_lcl_data *local = NULL;
-    vsi_size_t shape[VSI_NN_MAX_DIM_NUM];
-    char tensor_name[128];
-
-    dim = inputs[0]->attr.dim_num;
-    if(_require_reshape(self, inputs) == FALSE)
-    {
-        return VSI_SUCCESS;
-    }
-
-    VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid);
-    /*
-        reshape 3d input (xcn) --> 4d input (whcn)
-        reshape 3d output(xcn) --> 4d output(whcn)
-    */
-    dim = 4;
-    if (3 == inputs[0]->attr.dim_num)
-    {
-        shape[0] = inputs[0]->attr.size[0];
-        shape[1] = 1;
-        shape[2] = inputs[0]->attr.size[1];
-        shape[3] = inputs[0]->attr.size[2];
-    }
-    else if (5 == inputs[0]->attr.dim_num)
-    {
-        shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
-        shape[1] = inputs[0]->attr.size[2];
-        shape[2] = inputs[0]->attr.size[3];
-        shape[3] = inputs[0]->attr.size[4];
-    }
-    local = self->nn_param.batch_norm.local;
-    if (VSI_NN_OPTIMIZE_BACKWARD == direction)
-    {
-        local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim);
-    }
-    else
-    {
-        local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim);
-        if(local->reshaped_output && local->reshaped_output->t)
-        {
-            memset(tensor_name, 0, sizeof(tensor_name));
-            snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid);
-            if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE)
-            {
-                VSILOGW("Set uid %u batchnorm reshaped output name fail", self->uid);
-                return VSI_FAILURE;
-            }
-        }
-    }
-
-    return VSI_SUCCESS;
-} /* op_optimize() */
-
 static vsi_bool _dynamic_check
     (
     vsi_nn_node_t * self,
@@ -494,58 +494,6 @@ static vsi_bool op_check
     }
 } /* op_check() */
 
-static vsi_bool op_setup
-    (
-    vsi_nn_node_t * self,
-    vsi_nn_tensor_t ** inputs,
-    vsi_nn_tensor_t ** outputs
-    )
-{
-    vsi_nn_batcnnorm_lcl_data *local = NULL;
-    if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
-    {
-        outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
-        memcpy( outputs[0]->attr.size, inputs[0]->attr.size,
-            VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) );
-    }
-
-    if(_require_reshape(self, inputs))
-    {
-        local = (vsi_nn_batcnnorm_lcl_data *)malloc(sizeof(vsi_nn_batcnnorm_lcl_data));
-        if(NULL == local)
-        {
-            return VSI_FAILURE;
-        }
-        memset(local, 0, sizeof(vsi_nn_batcnnorm_lcl_data));
-        self->nn_param.batch_norm.local = local;
-    }
-    return TRUE;
-} /* op_setup() */
-
-static vsi_status op_deinit
-    (
-    vsi_nn_node_t * self
-    )
-{
-    vsi_nn_batch_norm_param *p = &(self->nn_param.batch_norm);
-    if(p->local)
-    {
-        if (p->local->reshaped_input)
-        {
-            vsi_nn_ReleaseTensor(&(p->local->reshaped_input));
-            p->local->reshaped_input = NULL;
-        }
-        if (p->local->reshaped_output)
-        {
-            vsi_nn_ReleaseTensor(&(p->local->reshaped_output));
-            p->local->reshaped_output = NULL;
-        }
-        vsi_nn_safe_free(p->local);
-    }
-    vsi_nn_op_common_deinit(self);
-    return VSI_SUCCESS;
-}
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -555,10 +503,10 @@ DEF_OP_REG
     /* op_name */ BATCH_NORM,
     /* init */ NULL,
     /* compute */ op_compute,
-    /* deinit */ op_deinit,
+    /* deinit */ vsi_nn_op_common_deinit,
     /* check */ op_check,
-    /* setup */ op_setup,
+    /* setup */ vsi_nn_op_common_setup,
-    /* optimize */ op_optimize,
+    /* optimize */ NULL,
     /* input_num */ 5,
     /* output_num */ 1
     );
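The reshape rule that used to live in op_optimize now runs inline in _static_batchnorm: per the old comment, a 3-D (x, c, n) input is padded to the 4-D (x, 1, c, n) whcn layout, and a 5-D input folds its two innermost dims so the kernel always sees rank 4. A standalone C sketch of that normalization (not ovxlib API; the function name is illustrative):

#include <stdint.h>

void batchnorm_shape_to_4d(const uint64_t* in, uint32_t rank, uint64_t out[4])
{
    if (rank == 3)          /* (x, c, n) -> (x, 1, c, n) */
    {
        out[0] = in[0]; out[1] = 1; out[2] = in[1]; out[3] = in[2];
    }
    else if (rank == 5)     /* fold the two innermost dims into one */
    {
        out[0] = in[0] * in[1]; out[1] = in[2]; out[2] = in[3]; out[3] = in[4];
    }
}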
@@ -118,6 +118,7 @@ static vsi_bool op_setup
     if (outputs[0]->attr.dim_num == 0)
     {
         outputs[0]->attr.size[0] = 1;
+        outputs[0]->attr.dim_num = 1;
         vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
     }
     else
@@ -82,6 +82,7 @@ static vsi_bool op_check
 {
     BEGIN_IO_TYPE_DECL(CUMSUM, 1, 1)
         IO_TYPE(D_U32, D_U32)
+        IO_TYPE(D_I32, D_I32)
         IO_TYPE(D_F32, D_F32)
         IO_TYPE(D_F16, D_F16)
         IO_TYPE(D_BF16, D_BF16)
@@ -253,6 +253,7 @@ static vsi_bool op_check
         IO_TYPE(D_BOOL8, D_I32)
         IO_TYPE(D_BOOL8, D_U16)
         IO_TYPE(D_BOOL8, D_U32)
+        IO_TYPE(D_BOOL8, D_BF16)
         IO_TYPE(D_U8|Q_ASYM, D_BOOL8)
         IO_TYPE(D_I8|Q_ASYM, D_BOOL8)
         IO_TYPE(D_I8|Q_DFP, D_BOOL8)
@@ -155,10 +155,10 @@ vsi_bool vsi_nn_op_eltwise_setup
     vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
     vsi_bool ret = TRUE;
 
-    out_rank = inputs[0]->attr.dim_num;
+    out_rank = vsi_nn_get_tensor_dims(inputs[0]);
     for ( i = 1; i < self->input.num; i++)
     {
-        in2_rank = inputs[i]->attr.dim_num;
+        in2_rank = vsi_nn_get_tensor_dims(inputs[i]);
         out_rank = vsi_nn_max( out_rank, in2_rank );
     }
 
@@ -166,10 +166,10 @@ vsi_bool vsi_nn_op_eltwise_setup
     {
         vsi_size_t sz0, sz1;
 
-        sz0 = i < inputs[0]->attr.dim_num ? inputs[0]->attr.size[i] : 1;
+        sz0 = i < vsi_nn_get_tensor_dims(inputs[0]) ? inputs[0]->attr.size[i] : 1;
         for ( j = 1; j < self->input.num; j++)
         {
-            sz1 = i < inputs[j]->attr.dim_num ? inputs[j]->attr.size[i] : 1;
+            sz1 = i < vsi_nn_get_tensor_dims(inputs[j]) ? inputs[j]->attr.size[i] : 1;
             sz0 = vsi_nn_max( sz0, sz1 );
             if (sz0 != sz1 && sz0 != 1 && sz1 != 1)
             {
@@ -187,11 +187,12 @@ vsi_bool vsi_nn_op_eltwise_setup
     {
         outputs[0]->attr.dim_num = out_rank;
         memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) );
-        if (out_rank == 1 &&
-            vsi_nn_GetTensorIsScalar(inputs[0]) &&
+        if (vsi_nn_GetTensorIsScalar(inputs[0]) &&
             vsi_nn_GetTensorIsScalar(inputs[1]))
         {
             vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
+            outputs[0]->attr.size[0] = 1;
+            outputs[0]->attr.dim_num = 1;
         }
     }
     else
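The broadcast check above follows the usual numpy-style rule: for each dimension the two sizes must either match or one of them must be 1, and the output takes the larger size. A standalone C sketch of that per-dimension rule (not ovxlib API):

#include <stdbool.h>
#include <stdint.h>

bool broadcast_dim(uint64_t a, uint64_t b, uint64_t* out)
{
    if (a != b && a != 1 && b != 1)
    {
        return false;               /* shapes are incompatible */
    }
    *out = (a > b) ? a : b;
    return true;
}

For example, broadcasting (4, 1) against (1, 8) yields (4, 8), while (4, 3) against (4, 2) is rejected.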
@@ -199,6 +199,7 @@ static vsi_bool op_setup
     if (o_rank == 0)
     {
         outputs[0]->attr.size[0] = 1;
+        outputs[0]->attr.dim_num = 1;
         vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
     }
     else
@@ -306,6 +306,8 @@ static vsi_bool _op_check
         IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP)
         IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM)
         IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM)
+        IO_TYPE(D_U16|Q_ASYM, D_F32, D_F32, D_U16|Q_ASYM)
+        IO_TYPE(D_U16|Q_SYM, D_F32, D_F32, D_U16|Q_SYM)
     END_IO_TYPE_DECL(GROUP_NORM)
     if (!VALIDATE_OP_IO_TYPES(GROUP_NORM, self, inputs, self->input.num, outputs, self->output.num))
     {
@@ -25,6 +25,7 @@
 #include <stdlib.h>
 
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_graph.h"
@@ -197,6 +198,7 @@ static vsi_bool op_setup_default
     vsi_nn_internal_tensor_t * hstate_fc_outputs[GRUCELL_GATE_CNT] = { NULL };
     vsi_nn_internal_tensor_t * h_times_r = NULL;
     vsi_nn_tensor_attr_t attr;
+    vsi_nn_activation_e recurrent_activation = p->recurrent_activation;
 
     vsi_nn_internal_init_node_wksp( self );
 
@@ -230,7 +232,8 @@ static vsi_bool op_setup_default
     memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
     attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
     if (inputs[GRUCELL_IN_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ||
-        self->graph->ctx->config.support_stream_processor)
+        (((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor &&
+        recurrent_activation == VSI_NN_ACT_SIGMOID))
     {
         attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
     }
@@ -93,37 +93,15 @@ static vsi_bool op_check
 {
     BEGIN_IO_TYPE_DECL(L1_LAYER_NORM, 4, 1)
         IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32)
-        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_F16)
         IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F16)
-        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_U8|Q_ASYM)
         IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
-        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_DFP)
         IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_DFP)
-        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_ASYM)
         IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_ASYM)
-        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_SYM)
         IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_SYM)
-        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_DFP)
         IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_DFP)
-        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_ASYM)
         IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_ASYM)
-        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_SYM)
         IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_SYM)
         IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_BF16)
-        IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
-        IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_U8|Q_ASYM)
-        IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_I16|Q_DFP)
-        IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_I16|Q_ASYM)
-        IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_I16|Q_SYM)
-        IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_F16)
-        IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
-        IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_F16)
-        IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_I8|Q_DFP)
-        IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_I8|Q_ASYM)
-        IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_I8|Q_SYM)
-        IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_F16)
-        IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
-        IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_F16)
         IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
         IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F16)
         IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_I16|Q_DFP)
@@ -25,6 +25,7 @@
 #include <stdlib.h>
 
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_node.h"
@@ -351,7 +352,7 @@ static vsi_bool op_setup
     }
     else if ( ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 &&
                 outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) ||
-              self->graph->ctx->config.support_stream_processor )
+              ((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor )
     {
         vsi_nn_internal_tensor_t* output_tensor = NULL;
         vsi_nn_internal_tensor_t* reshape_tensor = NULL;
@@ -106,7 +106,7 @@ static vsi_bool op_setup
 
     vsi_nn_internal_init_node_wksp( self );
 
-    if ( axis != 0 && !self->graph->ctx->config.support_stream_processor)
+    if ( axis != 0 && !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor)
     {
         vsi_nn_internal_tensor_t* mean_tensor = NULL;
         vsi_nn_internal_tensor_t* vari_tensor = NULL;
@@ -25,6 +25,7 @@
 #include <stdlib.h>
 
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_log.h"
 #include "vsi_nn_graph.h"
@@ -139,7 +140,7 @@ static vsi_bool op_setup
 
     p->is_cifg = inputs[LSTMUNIT_ACT_INPUT_FC_I] == NULL;
     p->is_projection = outputs[LSTMUNIT_ACT_HSTATE_OUT] == NULL;
-    if (self->graph->ctx->config.support_stream_processor)
+    if (((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor)
    {
         p->is_layer_norm = inputs[LSTMUNIT_ACT_HSTATE_FC_F] == NULL;
     }
@@ -100,6 +100,7 @@ static vsi_bool op_check
         IO_TYPE(D_I32, D_I16|Q_ASYM)
         IO_TYPE(D_I32, D_I16|Q_SYM)
         IO_TYPE(D_I32, D_I32)
+        IO_TYPE(D_I32, D_BF16)
         IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM)
         IO_TYPE(D_U8|Q_ASYM, D_I16|Q_SYM)
         IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP)
@@ -111,8 +112,10 @@ static vsi_bool op_check
         IO_TYPE(D_U8|Q_ASYM, D_BF16)
         IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
         IO_TYPE(D_I8|Q_ASYM, D_F16)
+        IO_TYPE(D_I8|Q_ASYM, D_BF16)
         IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
         IO_TYPE(D_I8|Q_DFP, D_F16)
+        IO_TYPE(D_I8|Q_DFP, D_BF16)
         IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP)
         IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM)
         IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
@@ -124,11 +127,14 @@ static vsi_bool op_check
         IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM)
         IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM)
         IO_TYPE(D_I16|Q_ASYM, D_F16)
+        IO_TYPE(D_I16|Q_ASYM, D_BF16)
         IO_TYPE(D_I16|Q_ASYM, D_F32)
         IO_TYPE(D_I16|Q_SYM, D_U8|Q_ASYM)
         IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM)
+        IO_TYPE(D_I16|Q_SYM, D_BF16)
         IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM)
         IO_TYPE(D_I8|Q_SYM, D_F16)
+        IO_TYPE(D_I8|Q_SYM, D_BF16)
         IO_TYPE(D_BF16, D_BF16)
     END_IO_TYPE_DECL(ONE_HOT)
     if (!VALIDATE_OP_IO_TYPES(ONE_HOT, self, inputs, self->input.num, outputs, self->output.num))
@@ -36,6 +36,7 @@
 #include "vsi_nn_tensor_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_error.h"
 
 #define _INPUT_NUM (1)
 #define _OUTPUT_NUM (1)
@@ -50,33 +51,52 @@ static vsi_status op_compute
     vsi_status status = VSI_FAILURE;
     vsi_nn_kernel_param_t * param = NULL;
     vsi_nn_kernel_node_t n = NULL;
-    param =vsi_nn_kernel_param_create();
-
-    vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_rgb.local.scale_x );
-    vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_rgb.local.scale_y );
-    vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_rgb.rect.left );
-    vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_rgb.rect.top );
-    vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_rgb.r_mean );
-    vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_rgb.g_mean );
-    vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb.b_mean );
-    vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_rgb.r_scale );
-    vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_rgb.g_scale );
-    vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_rgb.b_scale );
-    vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_rgb.reverse_channel );
-    vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_rgb.local.enable_perm );
-    vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb.local.enable_copy );
-    n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", inputs, 1, outputs, 1, param );
-    if( n != NULL )
+    vsi_nn_tensor_t* reshape_tensor = NULL;
+    vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+    vsi_nn_pre_process_rgb_param* p = NULL;
+
+    memcpy(shape, inputs[0]->attr.size, inputs[0]->attr.dim_num * sizeof(vsi_size_t));
+
+    shape[0] = shape[1] * shape[0];
+    shape[1] = shape[2];
+    shape[2] = 1;
+
+    reshape_tensor = vsi_nn_reshape_tensor(self->graph,
+        inputs[0], shape, inputs[0]->attr.dim_num);
+    CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor failed", final);
+
+    p = (vsi_nn_pre_process_rgb_param*)&(self->nn_param.pre_process_rgb);
+
+    param = vsi_nn_kernel_param_create();
+
+    vsi_nn_kernel_param_add_int32( param, "scale_x", p->local->scale_x );
+    vsi_nn_kernel_param_add_int32( param, "scale_y", p->local->scale_y );
+    vsi_nn_kernel_param_add_int32( param, "left", p->rect.left );
+    vsi_nn_kernel_param_add_int32( param, "top", p->rect.top );
+    vsi_nn_kernel_param_add_float32( param, "r_mean", p->r_mean );
+    vsi_nn_kernel_param_add_float32( param, "g_mean", p->g_mean );
+    vsi_nn_kernel_param_add_float32( param, "b_mean", p->b_mean );
+    vsi_nn_kernel_param_add_float32( param, "r_scale", p->r_scale );
+    vsi_nn_kernel_param_add_float32( param, "g_scale", p->g_scale );
+    vsi_nn_kernel_param_add_float32( param, "b_scale", p->b_scale );
+    vsi_nn_kernel_param_add_int32( param, "reverse", p->reverse_channel );
+    vsi_nn_kernel_param_add_int32( param, "enable_perm", p->local->enable_perm );
+    vsi_nn_kernel_param_add_int32( param, "enable_copy", p->local->enable_copy );
+    n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", &reshape_tensor, 1, outputs, 1, param );
+    if ( n != NULL )
     {
         self->n = (vx_node)n;
         status = VSI_SUCCESS;
     }
 
-    if(param != NULL)
+    if (param != NULL)
     {
         vsi_nn_kernel_param_release( &param );
     }
 
+final:
+    vsi_safe_release_tensor(reshape_tensor);
+
     return status;
 } /* op_compute() */
@@ -166,35 +186,57 @@ static vsi_bool op_setup
     }
 
-    self->nn_param.pre_process_rgb.local.enable_perm = FALSE;
+    p->local->enable_perm = FALSE;
 
-    if (self->nn_param.pre_process_rgb.local.enable_perm == FALSE)
+    if (p->local->enable_perm == FALSE)
     {
-        p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]);
-        p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
+        p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]);
+        p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
     }
     else
     {
-        p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
-        p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]);
+        p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
+        p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]);
     }
 
-    p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15)));
+    p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15)));
 
     return TRUE;
 } /* op_setup() */
 
+static vsi_status op_init
+    (
+    vsi_nn_node_t* self
+    )
+{
+    vsi_status status = VSI_SUCCESS;
+
+    self->nn_param.pre_process_rgb.local =
+        (vsi_nn_pre_process_rgb_lcl_data*)malloc(sizeof(vsi_nn_pre_process_rgb_lcl_data));
+
+    if (NULL == self->nn_param.pre_process_rgb.local)
+    {
+        return VX_ERROR_NO_MEMORY;
+    }
+
+    memset(self->nn_param.pre_process_rgb.local, 0, sizeof(vsi_nn_pre_process_rgb_lcl_data));
+
+    return status;
+} /* op_init() */
+
 static vsi_status op_deinit
     (
     vsi_nn_node_t * self
     )
 {
-    if (self->nn_param.pre_process_rgb.local.local_tensor != NULL)
+    if (self->nn_param.pre_process_rgb.local->local_tensor != NULL)
     {
-        vxReleaseTensor(&self->nn_param.pre_process_rgb.local.local_tensor);
-        self->nn_param.pre_process_rgb.local.local_tensor = NULL;
+        vxReleaseTensor(&self->nn_param.pre_process_rgb.local->local_tensor);
+        self->nn_param.pre_process_rgb.local->local_tensor = NULL;
     }
 
+    vsi_nn_safe_free(self->nn_param.pre_process_rgb.local);
+
     vsi_nn_op_common_deinit(self);
 
     return VSI_SUCCESS;
@@ -208,7 +250,7 @@ extern "C" {
 DEF_OP_REG
     (
     /* op_name */ PRE_PROCESS_RGB,
-    /* init */ NULL,
+    /* init */ op_init,
     /* compute */ op_compute,
     /* deinit */ op_deinit,
     /* check */ op_check,
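The scale factors computed in op_setup are Q15 fixed point: (extent << 15) / out_extent encodes the resize ratio with 15 fractional bits, so a value of exactly 1 << 15 means "no scaling" and is what enables the pure-copy path. A standalone C sketch of the arithmetic (not ovxlib API; the function names are illustrative):

#include <stdbool.h>
#include <stdint.h>

int32_t q15_scale(uint32_t src_extent, uint32_t dst_extent)
{
    return (int32_t)((src_extent << 15) / dst_extent);   /* ratio in Q15 */
}

bool rgb_is_copy(int32_t scale_x, int32_t scale_y)
{
    return (scale_x == scale_y) && (scale_x == (1 << 15));
}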
@@ -79,7 +79,10 @@ static vsi_status _prelu_op_compute
     vsi_status status = VSI_FAILURE;
     vsi_nn_prelu_param *prelu = &self->nn_param.prelu;
     vsi_ssize_t shapes[VSI_NN_MAX_DIM_NUM] = { 1 };
-    vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
+    vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
+    vsi_nn_tensor_t* input0 = NULL;
+    vsi_nn_tensor_t* input1 = NULL;
+    vsi_nn_tensor_t* output = NULL;
     vsi_bool one_rank = FALSE;
     vsi_bool is_per_channel_alpha = 0;
     vsi_size_t alpha_shape = 1;
@@ -88,6 +91,7 @@ static vsi_status _prelu_op_compute
     uint32_t dims = outputs[0]->attr.dim_num;
 
     reshape_tensors[0] = inputs[0];
+    reshape_tensors[2] = outputs[0];
     one_rank = _is_one_rank_tensor(inputs[1], &alpha_shape);
 
     for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
@@ -114,18 +118,23 @@ static vsi_status _prelu_op_compute
             dims = inputs[1]->attr.dim_num;
         }
 
-        reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
+        input1 = vsi_nn_reshape_tensor( self->graph,
             inputs[1], (vsi_size_t*)shapes, dims );
+        CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
+        reshape_tensors[1] = input1;
     }
     else
     {
         memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t));
-        reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
+        input1 = vsi_nn_reshape_tensor( self->graph,
             inputs[1], (vsi_size_t*)shapes, inputs[1]->attr.dim_num );
+        CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
+        reshape_tensors[1] = input1;
     }
 }
 else
 {
+    uint32_t rank = inputs[0]->attr.dim_num;
     dims = inputs[1]->attr.dim_num;
 
     memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t));
@@ -141,9 +150,32 @@ static vsi_status _prelu_op_compute
         shapes[1] = 1;
         dims = 2;
     }
+    else if (one_rank && inputs[1]->attr.is_const == TRUE &&
+        alpha_shape == inputs[0]->attr.size[0] &&
+        alpha_shape == inputs[1]->attr.size[0] &&
+        rank < 3)
+    {
+        is_per_channel_alpha = TRUE;
+        shapes[0] = 1;
+        shapes[1] = 1;
+        shapes[2] = alpha_shape;
+        shapes[3] = rank > 1 ? inputs[0]->attr.size[1] : 1;
+        dims = 4;
+        input0 = vsi_nn_reshape_tensor(self->graph, inputs[0], (vsi_size_t*)shapes, dims);
+        CHECK_PTR_FAIL_GOTO(input0, "Create tensor fail.", final);
+        reshape_tensors[0] = input0;
+        output = vsi_nn_reshape_tensor(self->graph, outputs[0], (vsi_size_t*)shapes, dims);
+        CHECK_PTR_FAIL_GOTO(output, "Create tensor fail.", final);
+        reshape_tensors[2] = output;
+        shapes[0] = alpha_shape;
+        shapes[1] = 1;
+        dims = 2;
+    }
 
-    reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
+    input1 = vsi_nn_reshape_tensor( self->graph,
         inputs[1], (vsi_size_t*)shapes, dims );
+    CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
+    reshape_tensors[1] = input1;
 }
 
 // Add params
@@ -153,15 +185,19 @@ static vsi_status _prelu_op_compute
     self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
         kernel_name,
         &reshape_tensors[0], 2,
-        outputs, 1, param );
+        &reshape_tensors[2], 1, param );
 
     vsi_nn_kernel_param_release( &param );
-    vsi_nn_ReleaseTensor( &reshape_tensors[1] );
-    if( self->n )
+    if ( self->n )
     {
         status = VSI_SUCCESS;
     }
 
+final:
+    vsi_safe_release_tensor(input0);
+    vsi_safe_release_tensor(input1);
+    vsi_safe_release_tensor(output);
+
     return status;
 } /* _prelu_op_compute() */
@@ -211,28 +247,36 @@ static vsi_bool op_check
     )
 {
     BEGIN_IO_TYPE_DECL(PRELU, 2, 1)
         IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM)
         IO_TYPE(D_F16, D_F16, D_I16|Q_DFP)
         IO_TYPE(D_F16, D_F16, D_I8|Q_DFP)
         IO_TYPE(D_F16, D_F16, D_F16)
         IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM)
         IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16)
         IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP)
         IO_TYPE(D_I8|Q_DFP, D_F16, D_F16)
+        IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM)
+        IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM)
+        IO_TYPE(D_I8|Q_SYM, D_F16, D_F16)
+        IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16)
         IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP)
         IO_TYPE(D_I16|Q_DFP, D_F16, D_F16)
+        IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM)
+        IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM)
+        IO_TYPE(D_I16|Q_SYM, D_F16, D_F16)
+        IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16)
         IO_TYPE(D_BF16, D_F16, D_BF16)
         IO_TYPE(D_BF16, D_BF16, D_BF16)
         IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
         IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16)
         IO_TYPE(D_F32, D_F32, D_F32)
         IO_TYPE(D_I32, D_I32, D_I32)
 
     /* HW 9.0 */
         IO_TYPE(D_F32, D_BF16, D_BF16)
         IO_TYPE(D_BF16, D_BF16, D_F32)
     END_IO_TYPE_DECL(PRELU)
-    if(!VALIDATE_OP_IO_TYPES(PRELU, self, inputs, self->input.num, outputs, self->output.num)) {
+    if (!VALIDATE_OP_IO_TYPES(PRELU, self, inputs, self->input.num, outputs, self->output.num)) {
         char* desc = generate_op_io_types_desc(inputs,
             self->input.num, outputs, self->output.num);
         VSILOGE("Inputs/Outputs data type not support: %s", desc);
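The new per-channel branch only changes the layout, not the operation: it appears to lay a constant one-rank alpha out on a channel axis (input reshaped to (1, 1, C, N), alpha to (C, 1)) so that alpha[c] broadcasts across each channel. The element-wise rule PRELU then evaluates is the familiar one; a one-line C sketch (not ovxlib code):

float prelu_ref(float x, float alpha_c)
{
    return (x >= 0.0f) ? x : alpha_c * x;   /* per-channel slope on the negative side */
}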
@@ -162,7 +162,7 @@ static vsi_bool _check_is_sp_supported_type
     int32_t * axes = self->nn_param.reduce.local2->axes;
     int32_t axes_num = self->nn_param.reduce.local2->axes_num;
 
-    if ( !self->graph->ctx->config.support_stream_processor ||
+    if ( !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor ||
         (type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN && type != VSI_NN_REDUCE_MAX) )
     {
         return FALSE;
@@ -788,7 +788,7 @@ static vsi_bool op_set_reduce_axis(
         }
         *out_rank_x = inputs[0]->attr.dim_num;
     }
-    else if (!self->graph->ctx->config.support_stream_processor ||
+    else if (!((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor ||
         resolved_dim_count > 2)
     {
         optimzation_input_size(
@ -61,7 +61,7 @@ static vsi_status op_compute
     vx_nn_reshape_params_t reshape_param;

     memset(&attr, 0, sizeof(attr));
-    attr.size[0] = self->nn_param.reshape.dim_num;
+    attr.size[0] = vsi_nn_max(self->nn_param.reshape.dim_num, 1);
     attr.dim_num = 1;
     attr.is_const = TRUE;
     attr.dtype.vx_type = VSI_NN_TYPE_INT32;
@ -124,17 +124,28 @@ static vsi_bool op_setup
     vsi_bool ret = TRUE;
     if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
     {
-        vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
-        uint32_t i = 0;
-        for (i = 0; i < self->nn_param.reshape.dim_num; i++)
-        {
-            shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? \
-                (vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i];
-        }
-        ret = vsi_nn_CalcReshapeTensor(inputs[0],
-            outputs[0],
-            shape,
-            self->nn_param.reshape.dim_num);
+        if (self->nn_param.reshape.dim_num == 0 ||
+            self->nn_param.reshape.size == NULL
+            )
+        {
+            outputs[0]->attr.size[0] = 1;
+            outputs[0]->attr.dim_num = 1;
+            vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
+        }
+        else
+        {
+            vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+            uint32_t i = 0;
+            for (i = 0; i < self->nn_param.reshape.dim_num; i++)
+            {
+                shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? \
+                    (vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i];
+            }
+            ret = vsi_nn_CalcReshapeTensor(inputs[0],
+                outputs[0],
+                shape,
+                self->nn_param.reshape.dim_num);
+        }
     }

     return ret;
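Note: the new branch gives a zero-rank reshape a placeholder `{1}` shape plus a scalar flag, and `vsi_nn_get_tensor_dims` (added later in this commit) folds that flag back to rank 0. A standalone sketch of the convention, with the ovxlib types and setters stubbed out by plain C:

#include <stdio.h>
#include <stddef.h>

typedef struct {
    size_t size[6];
    unsigned dim_num;
    int is_scalar;   /* stands in for vsi_nn_SetTensorIsScalar() */
} tensor_attr_t;

/* Mirror of the new reshape rule: a rank-0 request becomes a 1-element
 * tensor stored as {1} with the scalar flag raised. */
static void setup_reshape_output(tensor_attr_t *out,
                                 const size_t *new_shape, unsigned dim_num)
{
    if (dim_num == 0 || new_shape == NULL) {
        out->size[0] = 1;
        out->dim_num = 1;
        out->is_scalar = 1;
    } else {
        unsigned i;
        for (i = 0; i < dim_num; i++) out->size[i] = new_shape[i];
        out->dim_num = dim_num;
        out->is_scalar = 0;
    }
}

/* Matches the new vsi_nn_get_tensor_dims(): scalars report rank 0. */
static unsigned reported_dims(const tensor_attr_t *t)
{
    return t->is_scalar ? 0 : t->dim_num;
}

int main(void)
{
    tensor_attr_t t = { {0}, 0, 0 };
    setup_reshape_output(&t, NULL, 0);
    printf("stored rank=%u, reported rank=%u\n", t.dim_num, reported_dims(&t));
    return 0;
}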
@ -66,7 +66,7 @@ static vsi_status op_compute
     }

     memset(&attr, 0, sizeof(attr));
-    attr.size[0] = self->nn_param.reshape2.dim_num;
+    attr.size[0] = vsi_nn_max(self->nn_param.reshape2.dim_num, 1);
     attr.dim_num = 1;
     attr.is_const = TRUE;
     attr.dtype.vx_type = VSI_NN_TYPE_INT32;
@ -161,13 +161,24 @@ static vsi_bool op_setup
     vsi_bool ret = TRUE;
     if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
     {
-        vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
-        memcpy(shape, self->nn_param.reshape2.size,
-            sizeof(vsi_size_t) * self->nn_param.reshape2.dim_num);
-        ret = vsi_nn_CalcReshapeTensor(inputs[0],
-            outputs[0],
-            shape,
-            self->nn_param.reshape2.dim_num);
+        if (self->nn_param.reshape2.dim_num == 0 ||
+            self->nn_param.reshape2.size == NULL
+            )
+        {
+            outputs[0]->attr.size[0] = 1;
+            outputs[0]->attr.dim_num = 1;
+            vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
+        }
+        else
+        {
+            vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+            memcpy(shape, self->nn_param.reshape2.size,
+                sizeof(vsi_size_t) * self->nn_param.reshape2.dim_num);
+            ret = vsi_nn_CalcReshapeTensor(inputs[0],
+                outputs[0],
+                shape,
+                self->nn_param.reshape2.dim_num);
+        }
     }

     return ret;
@ -0,0 +1,145 @@
+/****************************************************************************
+*
+*    Copyright (c) 2020 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+#include <string.h>
+#include <stdlib.h>
+
+#include "vsi_nn_types.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_node.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_ops.h"
+#include "vsi_nn_tensor.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "utils/vsi_nn_constraint_check.h"
+#include "utils/vsi_nn_dtype_util.h"
+#include "vsi_nn_error.h"
+
+typedef struct _rope_local_data_t {
+    int32_t placeholder;
+} rope_local_data_t;
+
+/*
+ Declare number of input and output.
+ */
+#define _INPUT_NUM          (3)
+#define _OUTPUT_NUM         (1)
+
+static vsi_status op_compute
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_param_t* param = NULL;
+    int32_t axis = self->nn_param.rope.axis;
+    vsi_bool interleaved = self->nn_param.rope.interleaved;
+
+    param = vsi_nn_kernel_param_create();
+
+    vsi_nn_kernel_param_add_int32(param, "axis", axis);
+    vsi_nn_kernel_param_add_int32(param, "interleaved", interleaved);
+    self->n = (vx_node)vsi_nn_kernel_selector(self->graph, "rope",
+        inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param);
+
+    if ( self->n )
+    {
+        status = VSI_SUCCESS;
+    }
+    if (param != NULL)
+    {
+        vsi_nn_kernel_param_release(&param);
+    }
+
+    return status;
+} /* op_compute() */
+
+static vsi_bool op_check
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    BEGIN_IO_TYPE_DECL(ROPE, _INPUT_NUM, _OUTPUT_NUM)
+        IO_TYPE(D_F32, D_F32, D_F32, D_F32)
+        IO_TYPE(D_BF16, D_BF16, D_BF16, D_BF16)
+        IO_TYPE(D_F16, D_F16, D_F16, D_F16)
+        IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP)
+        IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM)
+        IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM)
+        IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
+        IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP)
+        IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM)
+        IO_TYPE(D_U16|Q_ASYM, D_U16|Q_ASYM, D_U16|Q_ASYM, D_U16|Q_ASYM)
+        IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP)
+        IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM, D_I8|Q_SYM)
+        IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_U8|Q_ASYM)
+        IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM, D_U8|Q_ASYM)
+        IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_I8|Q_DFP)
+        IO_TYPE(D_I8|Q_SYM, D_F16, D_F16, D_I8|Q_SYM)
+        IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16, D_I8|Q_ASYM)
+        IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_U8|Q_ASYM)
+        IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_I16|Q_DFP)
+        IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_I16|Q_SYM)
+        IO_TYPE(D_U16|Q_ASYM, D_F16, D_F16, D_U16|Q_ASYM)
+        IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_I8|Q_DFP)
+        IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_I8|Q_SYM)
+        IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_U8|Q_ASYM)
+        IO_TYPE(D_I16|Q_SYM, D_F16, D_F16, D_U8|Q_ASYM)
+    END_IO_TYPE_DECL(ROPE)
+    if (!VALIDATE_OP_IO_TYPES(ROPE, self, inputs, self->input.num, outputs, self->output.num))
+    {
+        char* desc = generate_op_io_types_desc(inputs,
+            self->input.num, outputs, self->output.num);
+        VSILOGE("Inputs/Outputs data type not support: %s", desc);
+        destroy_op_io_types_desc(desc);
+        return FALSE;
+    }
+
+    return TRUE;
+} /* op_check() */
+
+__BEGIN_DECLS
+
+/* Registrar */
+DEF_OP_REG
+    (
+    /* op_name    */ ROPE,
+    /* init       */ NULL,
+    /* compute    */ op_compute,
+    /* deinit     */ vsi_nn_op_common_deinit,
+    /* check      */ op_check,
+    /* setup      */ vsi_nn_op_common_setup,
+    /* optimize   */ NULL,
+    /* input_num  */ _INPUT_NUM,
+    /* output_num */ _OUTPUT_NUM
+    );
+
+__END_DECLS
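Note: the new `rope` op registers a kernel with three inputs and forwards an `axis`/`interleaved` pair; the diff does not document the operands, but rotary-embedding kernels conventionally take the data tensor plus precomputed cos and sin tables. For reference, a self-contained sketch of the interleaved rotary math such a kernel computes; this is the textbook formulation, not ovxlib's kernel source:

#include <stdio.h>

/* Interleaved RoPE: consecutive pairs (x[2i], x[2i+1]) are rotated by a
 * per-pair angle, typically pos * base^(-2i/dim). The cos/sin tables are
 * passed in, matching a 3-input (data, cos, sin) kernel signature. */
static void rope_interleaved(float *x, const float *cos_tab,
                             const float *sin_tab, int dim)
{
    int i;
    for (i = 0; i < dim / 2; i++) {
        float x0 = x[2 * i], x1 = x[2 * i + 1];
        x[2 * i]     = x0 * cos_tab[i] - x1 * sin_tab[i];
        x[2 * i + 1] = x0 * sin_tab[i] + x1 * cos_tab[i];
    }
}

int main(void)
{
    float x[4] = { 1.f, 0.f, 0.f, 1.f };
    float c[2] = { 0.f, 1.f };   /* cos of 90deg, cos of 0deg */
    float s[2] = { 1.f, 0.f };   /* sin of 90deg, sin of 0deg */
    rope_interleaved(x, c, s, 4); /* first pair rotates to (0, 1) */
    printf("%.1f %.1f %.1f %.1f\n", x[0], x[1], x[2], x[3]);
    return 0;
}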
@ -25,6 +25,7 @@
 #include <stdlib.h>

 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_node.h"
@ -188,7 +189,7 @@ static vsi_status op_optimize
     }
     if ( _need_split_softmax(self, inputs) == FALSE ||
          self->nn_param.softmax_internal.axis != 0 ||
-         self->graph->ctx->config.support_stream_processor )
+         ((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor )
     {
         return status;
     }
@ -39,6 +39,10 @@
 #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
 #include "vsi_nn_error.h"

+typedef struct _topk_local_data_t {
+    vsi_bool use_internal_node;
+} topk_local_data_t;
+
 #define _INPUT_NUM          (1)
 #define _OUTPUT_NUM         (2)
@ -111,19 +115,43 @@ static vsi_status op_compute
     vsi_nn_tensor_t * out1_tensor = NULL;
     vsi_bool ret = FALSE;

-    if (inputs[0]->attr.size[axis] == 1)
+    if (self->nn_param.topk.local->use_internal_node)
     {
         return vsi_nn_internal_compute_node( self );
     }

-    ret = vsi_nn_kernel_optimize_softmax_shape(
-        inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
-        shapes[0], &rank_in, &new_axis0);
-
-    ret = vsi_nn_kernel_optimize_softmax_shape(
-        outputs[0]->attr.size, outputs[0]->attr.dim_num, axis,
-        shapes[1], &rank_out, &new_axis1);
+    if (inputs[0]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH)
+    {
+        int32_t i = 1;
+
+        shapes[0][0] = inputs[0]->attr.size[0];
+        shapes[1][0] = outputs[0]->attr.size[0];
+        shapes[0][1] = 1;
+        shapes[1][1] = 1;
+        for (i = 1; i < (int32_t)(inputs[0]->attr.dim_num); i++)
+        {
+            shapes[0][1] = shapes[0][1] * inputs[0]->attr.size[i];
+        }
+        for (i = 1; i < (int32_t)(outputs[0]->attr.dim_num); i++)
+        {
+            shapes[1][1] = shapes[1][1] * outputs[0]->attr.size[i];
+        }
+        new_axis0 = axis;
+        new_axis1 = axis;
+        rank_in = 2;
+        rank_out = 2;
+        ret = TRUE;
+    }
+    else
+    {
+        ret = vsi_nn_kernel_optimize_softmax_shape(
+            inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
+            shapes[0], &rank_in, &new_axis0);
+
+        ret = vsi_nn_kernel_optimize_softmax_shape(
+            outputs[0]->attr.size, outputs[0]->attr.dim_num, axis,
+            shapes[1], &rank_out, &new_axis1);
+    }
     if (ret)
     {
         uint32_t perm_in[VSI_NN_MAX_DIM_NUM] = {0};
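Note: when `size[0]` already exceeds `GPU_TENSOR_MAX_WIDTH`, the generic shape optimizer is skipped and the tensor is collapsed by hand into a 2-D view `[size[0], prod(size[1..n-1])]`. The arithmetic in isolation:

#include <stdio.h>
#include <stddef.h>

/* Collapse an N-D shape into [size0, product of the remaining dims],
 * mirroring the TopK fallback above. */
static void flatten_to_2d(const size_t *size, unsigned dim_num, size_t out[2])
{
    unsigned i;
    out[0] = size[0];
    out[1] = 1;
    for (i = 1; i < dim_num; i++) {
        out[1] *= size[i];
    }
}

int main(void)
{
    size_t shape[4] = { 70000, 4, 3, 2 };
    size_t flat[2];
    flatten_to_2d(shape, 4, flat);
    printf("[%zu, %zu]\n", flat[0], flat[1]); /* prints [70000, 24] */
    return 0;
}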
@ -303,10 +331,12 @@ static vsi_bool op_setup
     vsi_nn_internal_tensor_t* const0_input = NULL;
     vsi_nn_tensor_attr_t attr;

+    p->local->use_internal_node = TRUE;
+
     vsi_nn_internal_init_node_wksp(self);
     curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1);
     CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
     curr->inputs[0] = inputs[0];
     curr->outputs[0] = outputs[0];
     vsi_nn_internal_setup_node(self, curr);
@ -318,10 +348,42 @@ static vsi_bool op_setup
         CHECK_PTR_FAIL_GOTO(const0_input, "Create tensor failed", final);
         curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1);
         CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
         curr->inputs[0] = const0_input->t;
         curr->outputs[0] = outputs[1];
         vsi_nn_internal_setup_node(self, curr);
     }
+    else if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE)
+    {
+        vsi_nn_internal_node_t* curr = NULL;
+        vsi_nn_internal_tensor_t* temp_tensor = NULL;
+        vsi_nn_tensor_attr_t attr;
+
+        p->local->use_internal_node = TRUE;
+
+        vsi_nn_internal_init_node_wksp(self);
+
+        memcpy(&attr, &inputs[0]->attr, sizeof(vsi_nn_tensor_attr_t));
+        attr.dim_num = VSI_NN_DIM_AUTO;
+        attr.vtl = TRUE;
+        attr.is_const = FALSE;
+        temp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+        CHECK_PTR_FAIL_GOTO(temp_tensor, "Create tensor failed", final);
+
+        curr = vsi_nn_internal_new_node(self, VSI_NN_OP_TOPK, 1, 2);
+        CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
+        curr->node->nn_param.topk.axis = p->axis;
+        curr->node->nn_param.topk.k = p->k;
+        curr->inputs[0] = inputs[0];
+        curr->outputs[0] = temp_tensor->t;
+        curr->outputs[1] = outputs[1];
+        vsi_nn_internal_setup_node(self, curr);
+
+        curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1);
+        CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
+        curr->inputs[0] = temp_tensor->t;
+        curr->outputs[0] = outputs[0];
+        vsi_nn_internal_setup_node(self, curr);
+    }

     return TRUE;
 final:
@ -341,7 +403,7 @@ static vsi_status op_optimize
     VSI_UNREFERENCED(outputs);

     p = &(self->nn_param.topk);
-    if (inputs[0]->attr.size[p->axis] == 1)
+    if (p->local->use_internal_node)
     {
         return vsi_nn_internal_optimize_node( self, direction );
     }
@ -357,6 +419,14 @@ static vsi_status op_init
     vsi_status status = VSI_SUCCESS;
     self->nn_param.topk.axis = 0;

+    self->nn_param.topk.local = \
+        (topk_local_data_t*)malloc(sizeof(topk_local_data_t));
+    if (NULL == self->nn_param.topk.local)
+    {
+        return VX_ERROR_NO_MEMORY;
+    }
+    memset(self->nn_param.topk.local, 0, sizeof(topk_local_data_t));
+
     return status;
 } /* op_init() */
@ -365,7 +435,12 @@ static vsi_status op_deinit
     vsi_nn_node_t * self
     )
 {
-    vsi_nn_internal_deinit_node_wksp(self);
+    if (self->nn_param.topk.local->use_internal_node)
+    {
+        vsi_nn_internal_deinit_node_wksp(self);
+    }
+
+    vsi_nn_safe_free(self->nn_param.topk.local);
     vsi_nn_op_common_deinit(self);

     return VSI_SUCCESS;
@ -475,6 +475,7 @@ static _op_param_gen_t s_op_gen[] =
     /* GROUPED_CONV3D */ NULL,
     /* COL2IM */ NULL,
     /* L1_LAYER_NORM */ NULL,
+    /* ROPE */ NULL,
     };
 _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c );
@ -98,7 +98,7 @@ static VSI_INLINE_API void _convert_bfloat16_to_float
     uint32_t i;
     for( i = 0; i < size; i ++ )
     {
-        out_buffer[i] = bfp16_to_fp32( (int16_t)buffer[i] );
+        out_buffer[i] = bfp16_to_fp32( (uint16_t)buffer[i] );
     }
 } /* _convert_bfloat16_to_float */
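Note: this one-character change fixes a real bug. Routing the bfloat16 bits through `int16_t` sign-extends any value with the sign bit set when it is widened, so the subsequent shift no longer produces the intended IEEE-754 pattern (and left-shifting a negative value is undefined behavior). A minimal sketch of the conversion, assuming the usual "bf16 is the top half of a float" definition of `bfp16_to_fp32`:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* bfloat16 -> float: place the 16 stored bits in the high half of a
 * 32-bit word. The operand must be unsigned; otherwise a set sign bit
 * is sign-extended before the shift and the result is wrong. */
static float bfp16_to_fp32(uint16_t bits)
{
    uint32_t u = (uint32_t)bits << 16;
    float f;
    memcpy(&f, &u, sizeof f); /* bit-cast without aliasing violations */
    return f;
}

int main(void)
{
    /* 0xBF80 is -1.0f in bfloat16 (sign bit set). */
    printf("%f\n", bfp16_to_fp32(0xBF80));
    return 0;
}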
@ -40,6 +40,7 @@
 #include "vsi_nn_prv.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_tensor_util.h"
 #include "vsi_nn_log.h"
@ -1261,7 +1262,9 @@ vsi_bool vsi_nn_is_same_quant_type(
             break;
         }
 #ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
-        case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC: {
+        case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC:
+        case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC:
+        {
             const float diff = (float)1e-5;
             int32_t i = 0;
             int32_t scale_cnt0 = src_dtype->group_count;
@ -1627,12 +1630,12 @@ vsi_bool vsi_nn_is_stream_process_supported_types
 {
     size_t i = 0;

-    if ( graph->ctx->config.support_stream_processor == 0 )
+    if ( ((vsi_nn_graph_prv_t*)graph)->options->config.support_stream_processor == 0 )
     {
         return FALSE;
     }

-    if ( graph->ctx->config.sp_exec_count == 0 )
+    if ( ((vsi_nn_graph_prv_t*)graph)->options->config.sp_exec_count == 0 )
     {
         return FALSE;
     }
@ -1769,3 +1772,11 @@ typedef enum

     return support;
 }
+
+uint32_t vsi_nn_get_tensor_dims
+    (
+    vsi_nn_tensor_t* tensor
+    )
+{
+    return vsi_nn_GetTensorIsScalar(tensor) ? 0 : tensor->attr.dim_num;
+}
@ -39,6 +39,9 @@ static vsi_status query_hardware_caps
 #endif
 #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
     vx_hardware_caps_params_ext_t paramExt;
+#if VX_FIXED_FUNCTION_DEVICE_SUPPORT
+    vx_hardware_caps_params_ext3_t paramExt3;
+#endif

     memset(&paramExt, 0, sizeof(vx_hardware_caps_params_ext_t));
     status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt),
@ -73,6 +76,13 @@ static vsi_status query_hardware_caps
     }
 #endif

+#if VX_FIXED_FUNCTION_DEVICE_SUPPORT
+    memset(&paramExt3, 0, sizeof(vx_hardware_caps_params_ext3_t));
+    status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt3),
+        sizeof(vx_hardware_caps_params_ext3_t));
+    context->config.support_ffd = paramExt3.supportFixedFunctionDevice;
+#endif
+
 #endif

     if(param.evis1 == TRUE && param.evis2 == FALSE)
@ -93,6 +103,85 @@ final:
     return status;
 }

+vsi_status query_hardware_caps_runtime
+    (
+    vsi_nn_context_t context,
+    vsi_nn_runtime_option_t* options
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vx_hardware_caps_params_t param;
+    VSI_UNREFERENCED(options);
+    memset(&(options->config), 0, sizeof(vsi_nn_hw_config_t));
+#if VX_STREAM_PROCESSOR_SUPPORT
+    vx_hardware_caps_params_ext2_t paramExt2;
+#endif
+#if VX_FIXED_FUNCTION_DEVICE_SUPPORT
+    vx_hardware_caps_params_ext3_t paramExt3;
+#endif
+#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
+    vx_hardware_caps_params_ext_t paramExt;
+
+    memset(&paramExt, 0, sizeof(vx_hardware_caps_params_ext_t));
+    status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt),
+        sizeof(vx_hardware_caps_params_ext_t));
+    param.evis1 = paramExt.base.evis1;
+    param.evis2 = paramExt.base.evis2;
+#else
+    memset(&param, 0, sizeof(vx_hardware_caps_params_t));
+    status = vxQueryHardwareCaps(context->c, &param, sizeof(vx_hardware_caps_params_t));
+#endif
+    TEST_CHECK_STATUS(status, final);
+
+#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
+    options->config.subGroupSize = paramExt.subGroupSize;
+#ifdef VSI_40BIT_VA_SUPPORT
+    options->config.use_40bits_va = paramExt.supportVA40;
+#endif
+#if VX_STREAM_PROCESSOR_SUPPORT
+    memset(&paramExt2, 0, sizeof(vx_hardware_caps_params_ext2_t));
+    status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt2),
+        sizeof(vx_hardware_caps_params_ext2_t));
+    if (options->enable_stream_processor)
+    {
+        options->config.support_stream_processor = paramExt.supportStreamProcessor;
+        options->config.sp_exec_count = paramExt2.streamProcessorExecCount;
+        options->config.sp_vector_depth = paramExt2.streamProcessorVectorSize;
+        if (options->config.sp_exec_count > 0)
+        {
+            options->config.sp_per_core_vector_depth =
+                options->config.sp_vector_depth / options->config.sp_exec_count;
+        }
+    }
+#endif
+
+#if VX_FIXED_FUNCTION_DEVICE_SUPPORT
+    memset(&paramExt3, 0, sizeof(vx_hardware_caps_params_ext3_t));
+    status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt3),
+        sizeof(vx_hardware_caps_params_ext3_t));
+    options->config.support_ffd = paramExt3.supportFixedFunctionDevice;
+#endif
+
+#endif
+
+    if(param.evis1 == TRUE && param.evis2 == FALSE)
+    {
+        options->config.evis.ver = VSI_NN_HW_EVIS_1;
+    }
+    else if(param.evis1 == FALSE && param.evis2 == TRUE)
+    {
+        options->config.evis.ver = VSI_NN_HW_EVIS_2;
+    }
+    else
+    {
+        options->config.evis.ver = VSI_NN_HW_EVIS_NONE;
+        VSILOGW("Unsupported evis version");
+    }
+
+final:
+    return status;
+}
+
 #if (defined(__ANDROID__)) && ((ANDROID_SDK_VERSION >= 30) || (__ANDROID_API__ >= 30))
 static const char* ENV_ENABLE_SHADER = "vendor.VIV_VX_ENABLE_SHADER";
 static const char* ENV_ENABLE_OPCHECK = "vendor.VSI_NN_ENABLE_OPCHECK";
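Note: one detail worth calling out in the new runtime query: `sp_per_core_vector_depth` is only derived when `sp_exec_count` is non-zero, which avoids a divide-by-zero on parts that report a stream processor with no execution units. The guard in isolation:

#include <stdio.h>

typedef struct {
    int sp_exec_count;
    int sp_vector_depth;
    int sp_per_core_vector_depth;
} sp_caps_t;

/* Derive the per-core depth only when the divisor is meaningful. */
static void derive_per_core_depth(sp_caps_t *c)
{
    if (c->sp_exec_count > 0) {
        c->sp_per_core_vector_depth = c->sp_vector_depth / c->sp_exec_count;
    }
}

int main(void)
{
    sp_caps_t caps = { 4, 64, 0 };
    derive_per_core_depth(&caps);
    printf("per-core depth: %d\n", caps.sp_per_core_vector_depth); /* 16 */
    return 0;
}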
@ -153,6 +242,44 @@ vsi_status vsi_nn_initOptions
     return VSI_SUCCESS;
 }

+vsi_status vsi_nn_initOptions_runtime
+    (
+    vsi_nn_runtime_option_t *options,
+    vsi_nn_context_t ctx
+    )
+{
+    int32_t default_value = 1;
+
+    options->enable_shader = vsi_nn_getenv_asint(ENV_ENABLE_SHADER, 1);
+    options->enable_opcheck = vsi_nn_getenv_asint(ENV_ENABLE_OPCHECK, 1);
+#if (VX_CONCAT_OPT_SUPPORT)
+    default_value = 0;
+#else
+    default_value = 1;
+#endif
+    options->enable_concat_optimize = vsi_nn_getenv_asint(ENV_ENABLE_CONCAT_OPTIMIZE, default_value);
+    options->enable_i8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1);
+    options->enable_dataconvert_optimize = vsi_nn_getenv_asint(ENV_ENABLE_DATACONVERT_OPTIMIZE, 1);
+    options->enable_stream_processor = vsi_nn_getenv_asint(ENV_ENABLE_STREAM_PROCESSOR, 1);
+    options->enable_rgb88_planar_nhwc = vsi_nn_getenv_asint(ENV_FORCE_RGB888_OUT_NHWC, 0);
+#if (VX_STRIDED_SLICE_OPT_SUPPORT)
+    default_value = 0;
+#else
+    default_value = 1;
+#endif
+    options->enable_slice_optimize = vsi_nn_getenv_asint(ENV_ENABLE_SLICE_OPTIMIZE, default_value);
+    options->enable_batch_opt = vsi_nn_getenv_asint(ENV_ENABLE_BATCH_OPT, 0);
+    options->enable_save_file_type = vsi_nn_getenv_asint(ENV_SAVE_FILE_TYPE, 0);
+    options->enable_use_image_process = vsi_nn_getenv_asint(VSI_USE_IMAGE_PROCESS, -1);
+    options->enable_use_from_handle = vsi_nn_getenv_asint(VSI_USE_FROM_HANDLE, -1);
+
+    /*init hw params*/
+    options->config = ctx->config;
+
+    return VSI_SUCCESS;
+}
+
+
 vsi_nn_context_t vsi_nn_CreateContext
     ( void )
 {
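Note: `vsi_nn_initOptions_runtime` seeds every toggle through `vsi_nn_getenv_asint`, an env-var-with-default lookup, then snapshots the context's hardware config into the per-graph options. A plausible standalone equivalent of that helper; the real ovxlib implementation may differ, and the variable name below is a placeholder:

#include <stdio.h>
#include <stdlib.h>

/* Read an integer from the environment, falling back to a default when
 * the variable is unset or not a number; this is how the runtime
 * options above are seeded from their VIV_VX_* / VSI_NN_* variables. */
static int getenv_asint(const char *name, int fallback)
{
    const char *s = getenv(name);
    char *end = NULL;
    long v;

    if (s == NULL || *s == '\0') return fallback;
    v = strtol(s, &end, 10);
    if (end == s) return fallback; /* no digits parsed */
    return (int)v;
}

int main(void)
{
    /* "EXAMPLE_TOGGLE" is a placeholder name, not a real ovxlib knob. */
    printf("toggle: %d\n", getenv_asint("EXAMPLE_TOGGLE", 1));
    return 0;
}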
@ -1362,7 +1362,7 @@ vsi_nn_graph_t * vsi_nn_CreateGraph
         graph->isAllowFastMode = TRUE;
         vsi_nn_MapInit( graph->node_table );
         vsi_nn_MapInit( graph->tensor_table );
-        vsi_nn_initOptions( ((vsi_nn_graph_prv_t*) graph)->options );
+        vsi_nn_initOptions_runtime( ((vsi_nn_graph_prv_t*) graph)->options, ctx );
     }
     else
     {
@ -3398,6 +3398,7 @@ char* vsi_nn_GetRunTimeVariable
 #define varSize 256
     char* value_str = (char*)malloc(sizeof(char) * varSize);
     CHECK_PTR_FAIL_GOTO(value_str, "Create value_str fail.", final);
+    CHECK_PTR_FAIL_GOTO(graph, "Graph is NULL!", final);
     memset(value_str, 0, varSize);
     char tmp_value[varSize] = {0};
     VSI_UNREFERENCED(tmp_value);
@ -3502,6 +3503,8 @@ vsi_status vsi_nn_SetRunTimeVariable
             break;
         case VSI_VX_ENABLE_STREAM_PROCESSOR:
             options->enable_stream_processor = atoi(value);
+            options->config.support_stream_processor = atoi(value);
+            status = query_hardware_caps_runtime(graph->ctx, options);
             break;
         case VSI_VX_ENABLE_BATCH_OPT:
             options->enable_batch_opt = atoi(value);
@ -895,10 +895,13 @@ static void _convert_const_I8toU8
     attr->dtype.vx_type = VSI_NN_TYPE_UINT8;
     attr->dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC;
     attr->dtype.zero_point += 128;
-    if ( tensor->t ) vxReleaseTensor(&tensor->t);
+    if (tensor->t) vxReleaseTensor(&tensor->t);
     tensor->t = vsi_nn_CreateRawTensorFromData(graph, data, attr);
+#if defined(VSI_TENSOR_SPARSITY_SUPPORT)
+    int32_t is_sparsity = 0;
+    is_sparsity = vsi_nn_GetTensorIsSparsity(tensor);
+    vsi_nn_SetTensorIsSparsity(tensor, is_sparsity);
+#endif
 final:
     vsi_nn_safe_free( data );
 }/* _convert_const_I8toU8() */
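Note: the surrounding conversion relies on a standard identity: adding 128 to both the int8 code and the zero point leaves the dequantized value `scale * (q - zp)` unchanged; it only moves the code points into uint8 range. A worked check:

#include <stdio.h>
#include <stdint.h>

/* real = scale * (q - zp): shifting q and zp together by +128 is a no-op
 * on the dequantized value, which is why int8 data can be re-tagged as
 * uint8 by only bumping the zero point. */
int main(void)
{
    float scale = 0.5f;
    int8_t  q_i8 = -3;
    int32_t zp_i8 = 1;

    uint8_t q_u8  = (uint8_t)(q_i8 + 128);   /* 125 */
    int32_t zp_u8 = zp_i8 + 128;             /* 129 */

    float real_i8 = scale * (float)(q_i8 - zp_i8);
    float real_u8 = scale * (float)((int32_t)q_u8 - zp_u8);
    printf("%f == %f\n", real_i8, real_u8);  /* both -2.0 */
    return 0;
}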
@ -247,7 +247,8 @@ static void _set_preproc_node_input_attr
     vsi_nn_tensor_attr_t* attr,
     vsi_nn_preprocess_image_size_t* input_size,
     vsi_nn_preprocess_source_format_e* source_format,
-    vsi_nn_preprocess_source_layout_e* source_layout
+    vsi_nn_preprocess_source_layout_e* source_layout,
+    vsi_nn_preprocess_dtype_convert_t* data_convert
     )
 {
     *input_attr = *attr;
@ -266,26 +267,33 @@ static void _set_preproc_node_input_attr
     }
     if(*source_format == VSI_NN_SOURCE_FORMAT_TENSOR)
     {
-        input_attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
-        input_attr->dtype.vx_type = VSI_NN_TYPE_FLOAT32;
+        if(data_convert != NULL)
+        {
+            input_attr->dtype = data_convert->dtype;
+        }
+        else
+        {
+            input_attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
+            input_attr->dtype.vx_type = VSI_NN_TYPE_FLOAT32;
+        }
     }
     else
     {
         input_attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
         input_attr->dtype.vx_type = VSI_NN_TYPE_UINT8;
     }
-    if(*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB)
+    if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB)
     {
-        if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC)
+        if (*source_layout == VSI_NN_SOURCE_LAYOUT_NCHW)
         {
-            input_attr->size[0] = input_attr->size[1]*input_attr->size[0];
-            input_attr->size[1] = input_attr->size[2];
-            input_attr->size[2] = 1;
-        }
-        else
-        {
-            input_attr->size[0] = input_attr->size[2]*input_attr->size[0];
-            input_attr->size[2] = 1;
+            vsi_size_t channel = input_attr->size[2];
+            if (channel != 3)
+            {
+                VSILOGE("RGB chanel must be 3, please have a check!");
+            }
+            input_attr->size[2] = input_attr->size[1];
+            input_attr->size[1] = input_attr->size[0];
+            input_attr->size[0] = channel;
         }
     }
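Note: if I read the rewritten NCHW branch right, it rotates the `{d0, d1, d2}` extent triple one step so the channel extent moves from last to first. The rotation in isolation; the extent meanings below are my reading of the code, not stated in the diff:

#include <stdio.h>
#include <stddef.h>

/* Rotate {d0, d1, d2} so the old last extent (the channel count in this
 * code path) becomes the new first one: {W, H, C} -> {C, W, H}. */
static void rotate_channel_first(size_t size[3])
{
    size_t channel = size[2];
    size[2] = size[1];
    size[1] = size[0];
    size[0] = channel;
}

int main(void)
{
    size_t size[3] = { 224, 112, 3 };
    rotate_channel_first(size);
    printf("{%zu, %zu, %zu}\n", size[0], size[1], size[2]); /* {3, 224, 112} */
    return 0;
}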
@ -333,15 +341,10 @@ static void _set_preproc_node_input_attr
 static void _set_preproc_node_output_attr
     (
     vsi_nn_tensor_attr_t* output_attr,
-    vsi_nn_tensor_attr_t* attr,
-    vsi_nn_preprocess_dtype_convert_t* data_convert
+    vsi_nn_tensor_attr_t* attr
     )
 {
     *output_attr = *attr;
-    if(data_convert != NULL)
-    {
-        output_attr->dtype = data_convert->dtype;
-    }
     output_attr->dtype.fmt = VSI_NN_DIM_FMT_NCHW;
     output_attr->dim_num = VSI_NN_DIM_AUTO;
     output_attr->is_const = FALSE;
@ -603,10 +606,11 @@ vsi_status vsi_nn_add_single_preproc_node
     _set_preproc_node_out_attr(node, image_resize, &org_norm_tensor->attr, source_layout);

     /* Set input tensor attr */
-    _set_preproc_node_input_attr(&input_attr, &org_norm_tensor->attr, input_size, source_format, source_layout);
+    _set_preproc_node_input_attr(&input_attr, &org_norm_tensor->attr, input_size,
+        source_format, source_layout, data_convert);

     /* Set output tensor attr */
-    _set_preproc_node_output_attr(&output_attr, &org_norm_tensor->attr, data_convert);
+    _set_preproc_node_output_attr(&output_attr, &org_norm_tensor->attr);

     /* Create new norm and virtual tensors */
     if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 ||
@ -33,6 +33,7 @@
 #include "utils/vsi_nn_dtype_util.h"
 #include "utils/vsi_nn_util.h"
 #include "vsi_nn_rnn_helper.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_error.h"

 vsi_bool vsi_nn_rnn_find_best_kernel_size
@ -804,7 +805,7 @@ vsi_status vsi_nn_rnn_data_check_aligned
         vsi_size_t tensor_size = vsi_nn_GetTensorSize( input[i]->attr.size,
             input[i]->attr.dim_num, input[i]->attr.dtype.vx_type );

-        if( ofst & 0x3f && !self->graph->ctx->config.support_stream_processor)
+        if( ofst & 0x3f && !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor)
         {
             vsi_nn_internal_init_tensor_attr(&attr, &input[i]->attr.dtype, use_virtual_tensor);
             output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
@ -155,6 +155,15 @@ static void print_tensor
             tensor->attr.dtype.group_size);
         ext_attr[count] = 0;
         break;
+    case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC:
+        count = snprintf(&ext_attr[0],
+            _EXT_ATTR_BUF_SZ,
+            "ASYM GPTQ axis=%d, count=%d, group_size=%d",
+            tensor->attr.dtype.group_channel_dim,
+            tensor->attr.dtype.group_count,
+            tensor->attr.dtype.group_size);
+        ext_attr[count] = 0;
+        break;
 #endif
     default:
         vsi_nn_strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ);
@ -449,6 +458,11 @@ static vsi_bool _init_tensor
             scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.group_count);
             CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final );
             memcpy(scales, tensor->attr.dtype.group_scales, tensor->attr.dtype.group_count * sizeof(float));
+            zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim);
+            CHECK_PTR_FAIL_GOTO( zeroPoints, "Create buffer fail.", final );
+            memcpy(zeroPoints,
+                tensor->attr.dtype.zero_points,
+                tensor->attr.dtype.zero_points_dim * sizeof(int32_t));
             params.quant_data.affinePerGroup.channel_dim = tensor->attr.dtype.group_channel_dim;
             params.quant_data.affinePerGroup.group_size = tensor->attr.dtype.group_size;
             params.quant_data.affinePerGroup.scale_group_count = tensor->attr.dtype.group_count;
@ -460,6 +474,32 @@ static vsi_bool _init_tensor
             VSILOGE(
                 "can't support qnt_type "
                 "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC.");
+            break;
+#endif
+        case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC:
+#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT
+            params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_GROUP;
+            // This is a hack that driver doesn't support const scales
+            scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.group_count);
+            CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final );
+            memcpy(scales, tensor->attr.dtype.group_scales, tensor->attr.dtype.group_count * sizeof(float));
+            zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim);
+            CHECK_PTR_FAIL_GOTO( zeroPoints, "Create buffer fail.", final );
+            memcpy(zeroPoints,
+                tensor->attr.dtype.group_zero_points,
+                tensor->attr.dtype.group_count * sizeof(int32_t));
+            params.quant_data.affinePerGroup.channel_dim = tensor->attr.dtype.group_channel_dim;
+            params.quant_data.affinePerGroup.group_size = tensor->attr.dtype.group_size;
+            params.quant_data.affinePerGroup.scale_group_count = tensor->attr.dtype.group_count;
+            params.quant_data.affinePerGroup.scales = scales;
+            params.quant_data.affinePerGroup.zero_points = zeroPoints;
+            params.quant_data.affinePerGroup.zero_point_group_count = tensor->attr.dtype.group_count;
+            break;
+#else
+            VSILOGE(
+                "can't support qnt_type "
+                "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC.");
+            break;
 #endif
         default:
             break;
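Note: per-group asymmetric quantization, which this hunk wires up, keys each value to the group its channel index falls in: `real = scale[g] * (q - zp[g])` with `g = channel / group_size`. A worked example:

#include <stdio.h>
#include <stdint.h>

/* Per-group dequantization: channels are bucketed into groups of
 * group_size along the quantized axis, each group carrying its own
 * scale and (for the asymmetric variant added here) zero point. */
static float dequant_per_group(int32_t q, int channel, int group_size,
                               const float *scales, const int32_t *zps)
{
    int g = channel / group_size;
    return scales[g] * (float)(q - zps[g]);
}

int main(void)
{
    float   scales[2] = { 0.1f, 0.2f };
    int32_t zps[2]    = { 0, 5 };

    /* Channel 3 with group_size 2 falls into group 1: 0.2 * (15 - 5). */
    printf("%f\n", dequant_per_group(15, 3, 2, scales, zps)); /* 2.0 */
    return 0;
}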
@ -1788,6 +1828,57 @@ int8_t vsi_nn_GetTensorIsScalar
     return _get_tensor_is_scalar((vsi_nn_tensor_prv_t*)tensor);
 }

+int32_t _get_tensor_is_sparsity
+    (
+    vsi_nn_tensor_prv_t* tensor
+    )
+{
+    int32_t is_sparsity = FALSE;
+    if (NULL == tensor)
+    {
+        VSILOGE("To get is_sparsity, tensor pointer SHOULD NOT be NULL.");
+        goto final;
+    }
+#if defined(VSI_TENSOR_SPARSITY_SUPPORT)
+    is_sparsity = tensor->sparsity_type;
+#endif
+final:
+    return is_sparsity;
+}
+
+int32_t vsi_nn_GetTensorIsSparsity
+    (
+    vsi_nn_tensor_t* tensor
+    )
+{
+    return _get_tensor_is_sparsity((vsi_nn_tensor_prv_t*)tensor);
+}
+
+vsi_status vsi_nn_SetTensorIsSparsity
+    (
+    vsi_nn_tensor_t* tensor,
+    int32_t is_sparsity
+    )
+{
+    VSI_UNREFERENCED(is_sparsity);
+    vsi_status status = VSI_SUCCESS;
+    if (NULL == tensor) {
+        status = VSI_FAILURE;
+        goto final;
+    }
+#if defined(VSI_TENSOR_SPARSITY_SUPPORT)
+    vxSetTensorAttribute(tensor->t,
+        VX_TENSOR_SPARSITY_TYPE,
+        &is_sparsity,
+        sizeof(vx_enum));
+    status = VSI_SUCCESS;
+    ((vsi_nn_tensor_prv_t*)tensor)->sparsity_type = is_sparsity;
+#endif
+final:
+    return status;
+}
+
 vsi_status vsi_nn_CopyRawDataToTensor
     (
     vsi_nn_graph_t* graph,
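Note: the getter/setter pair exists mainly so side-band attributes survive tensor re-creation; `_convert_const_I8toU8` earlier in this commit reads the sparsity tag before rebuilding the vx tensor and writes it back afterwards. The save/restore pattern in the abstract, with the tensor reduced to a stub:

#include <stdio.h>

typedef struct { int sparsity_type; } tensor_t;

/* Rebuilding a backing object loses side-band attributes, so the common
 * pattern is read -> rebuild -> write back. */
static void rebuild(tensor_t *t) { t->sparsity_type = 0; /* reset */ }

int main(void)
{
    tensor_t t = { 1 };               /* e.g. 2:4 structured sparsity */
    int saved = t.sparsity_type;      /* vsi_nn_GetTensorIsSparsity() */
    rebuild(&t);                      /* vsi_nn_CreateRawTensorFromData() */
    t.sparsity_type = saved;          /* vsi_nn_SetTensorIsSparsity() */
    printf("sparsity preserved: %d\n", t.sparsity_type);
    return 0;
}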
@ -75,6 +75,11 @@ vsi_status _set_tensor_is_scalar
     int8_t is_salar
     );

+vsi_status _set_tensor_is_sparsity(
+    vsi_nn_tensor_prv_t* tensor,
+    int32_t is_sparsity
+    );
+
 int8_t _get_tensor_is_from_axisram
     (
     vsi_nn_tensor_prv_t* tensor
@ -127,6 +132,11 @@ vsi_nn_tensor_t * vsi_nn_kernel_insert_reshape_node
     vsi_nn_opt_direction_e direction
     );

+uint32_t vsi_nn_get_tensor_dims
+    (
+    vsi_nn_tensor_t* tensor
+    );
+
 #ifdef __cplusplus
 }
 #endif
@ -108,6 +108,11 @@ typedef struct _vsi_nn_tensor_prv
     /** create tensor from axisram.*/
     int8_t is_from_axisram;

+    /** 2:4 sparsity attr. */
+#if defined(VSI_TENSOR_SPARSITY_SUPPORT)
+    vx_tensor_sparsity_param_e sparsity_type; /*!< \brief sparsity type for the tensor */
+#endif
+
     // Add tensor internal attribute here...
 } vsi_nn_tensor_prv_t;