Compare commits

10 commits: 720fe0306f ... c05cfdc623

| Author | SHA1 | Date |
|---|---|---|
| | c05cfdc623 | |
| | 1ad3aabcfe | |
| | 7b24f4d437 | |
| | 3c83eca946 | |
| | c4e75674fa | |
| | 6810d310d3 | |
| | 8494275d76 | |
| | 149834832c | |
| | fcdf223d06 | |
| | 81b6c07c5d | |
@@ -35,7 +35,7 @@ jobs:
       run: |
         cmake --install ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
     - name: upload tim-vx-install
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
       with:
         name: tim-vx-install
         path: |
@@ -75,7 +75,7 @@ jobs:
       VIVANTE_SDK_DIR: ${{github.workspace}}/prebuilt-sdk/x86_64_linux/
     steps:
     - name: download tim-vx build output
-      uses: actions/download-artifact@v3
+      uses: actions/download-artifact@v4
       with:
         name: tim-vx-install
@@ -102,7 +102,7 @@ jobs:
       VIV_VX_DISABLE_TP_NN_EVIS: 1
     steps:
     - name: download tim-vx build output
-      uses: actions/download-artifact@v3
+      uses: actions/download-artifact@v4
       with:
         name: tim-vx-install
@@ -117,21 +117,21 @@ jobs:
     needs: tim-vx-build
     steps:
     - name: download tim-vx build output
-      uses: actions/download-artifact@v3
+      uses: actions/download-artifact@v4
       with:
         name: tim-vx-install
     - name: apply-patch-build
       run: |
        git config --global user.email "xiang.zhang@verisilicon.com"
        git config --global user.name "xiang.zhang"
-       git clone https://github.com/tensorflow/tensorflow.git ${{github.workspace}}/3rd-party/tensorflow && cd ${{github.workspace}}/3rd-party/tensorflow/ && git checkout v2.10.0
+       git clone https://github.com/tensorflow/tensorflow.git ${{github.workspace}}/3rd-party/tensorflow && cd ${{github.workspace}}/3rd-party/tensorflow/ && git checkout v2.16.1
        git clone https://github.com/VeriSilicon/tflite-vx-delegate.git ${{github.workspace}}/vx-delegate
        cmake -B ${{github.workspace}}/vx-delegate/build -S ${{github.workspace}}/vx-delegate -DFETCHCONTENT_SOURCE_DIR_TENSORFLOW=${{github.workspace}}/3rd-party/tensorflow -DTIM_VX_INSTALL=${{github.workspace}}/tim-vx.install.dir/ -DTFLITE_ENABLE_NNAPI=OFF -DTFLITE_ENABLE_XNNPACK=OFF
        cmake --build ${{github.workspace}}/vx-delegate/build --config ${{env.BUILD_TYPE}}
        cd ${{github.workspace}}/vx-delegate/build
        make vx_delegate benchmark_model
     - name: upload vx-delegate
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
       with:
         name: vx-delegate-bin
         path: |
@@ -144,7 +144,7 @@ jobs:
     needs: [vx-delegate-build, tim-vx-unit-test]
     steps:
     - name: download binary
-      uses: actions/download-artifact@v3
+      uses: actions/download-artifact@v4
     - name: download mobilenet_v2_quant.tflite
       run: |
@@ -159,7 +159,7 @@ jobs:
     needs: [vx-delegate-build, tim-vx-unit-test]
     steps:
     - name: download binary
-      uses: actions/download-artifact@v3
+      uses: actions/download-artifact@v4
     - name: download mobilenet_v2_b8_quant.tflite
       run: |
        curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/mobilenet_v2_b8_quant.tflite
@@ -173,7 +173,7 @@ jobs:
     needs: [vx-delegate-build, tim-vx-unit-test]
     steps:
     - name: download test binary
-      uses: actions/download-artifact@v3
+      uses: actions/download-artifact@v4
     - name: download resnet_quant.tflite
       run: |
        curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/resnet_quant.tflite
@@ -187,7 +187,7 @@ jobs:
     needs: [vx-delegate-build, tim-vx-unit-test]
     steps:
     - name: download test binary
-      uses: actions/download-artifact@v3
+      uses: actions/download-artifact@v4
     - name: download model
       run: |
        curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/inception_v3_quant.tflite
@@ -201,7 +201,7 @@ jobs:
     needs: [vx-delegate-build, tim-vx-unit-test]
     steps:
     - name: download test binary
-      uses: actions/download-artifact@v3
+      uses: actions/download-artifact@v4
     - name: download model
       run: |
        curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/mobilenet_v3_b4_quant.tflite
@@ -215,7 +215,7 @@ jobs:
     needs: [vx-delegate-build, tim-vx-unit-test]
     steps:
     - name: download test binary
-      uses: actions/download-artifact@v3
+      uses: actions/download-artifact@v4
     - name: download model
       run: |
        curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/mobilenet_v3_quant.tflite
@@ -229,7 +229,7 @@ jobs:
     needs: [vx-delegate-build, tim-vx-unit-test]
     steps:
     - name: download test binary
-      uses: actions/download-artifact@v3
+      uses: actions/download-artifact@v4
     - name: download model
       run: |
        curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/mv3_depth_quant.tflite
@@ -243,7 +243,7 @@ jobs:
     needs: [vx-delegate-build, tim-vx-unit-test]
     steps:
     - name: download test binary
-      uses: actions/download-artifact@v3
+      uses: actions/download-artifact@v4
     - name: download model
       run: |
        curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/yolo_v4_tiny_quant.tflite
@@ -258,7 +258,7 @@ jobs:
   #   needs: [vx-delegate-build, tim-vx-unit-test]
   #   steps:
   #   - name: download test binary
-  #     uses: actions/download-artifact@v3
+  #     uses: actions/download-artifact@v4
   #   - name: download model
   #     run: |
   #       curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/deeplab_v3_plus_quant.tflite
@@ -273,7 +273,7 @@ jobs:
   #   needs: vx-delegate-build
   #   steps:
   #   - name: download test binary
-  #     uses: actions/download-artifact@v3
+  #     uses: actions/download-artifact@v4
   #   - name: download model
   #     run: |
   #       wget https://storage.googleapis.com/tfhub-lite-models/google/lite-model/movenet/multipose/lightning/tflite/float16/1.tflite
@@ -283,68 +283,68 @@ jobs:
   #       chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
   #       ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/tfhub.movenet.multipose.tflite

-  tfhub-efficientdet-lite0:
-    runs-on: ubuntu-latest
-    needs: [vx-delegate-build, tim-vx-unit-test]
-    steps:
-    - name: download test binary
-      uses: actions/download-artifact@v3
-    - name: download model
-      run: |
-        wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite0/detection/metadata/1.tflite
-    - name: benchmark-model
-      run: |
-        chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
-        ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
+  # tfhub-efficientdet-lite0:
+  #   runs-on: ubuntu-latest
+  #   needs: [vx-delegate-build, tim-vx-unit-test]
+  #   steps:
+  #   - name: download test binary
+  #     uses: actions/download-artifact@v4
+  #   - name: download model
+  #     run: |
+  #       wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite0/detection/metadata/1.tflite
+  #   - name: benchmark-model
+  #     run: |
+  #       chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
+  #       ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite

-  tfhub-efficientdet-lite1:
-    runs-on: ubuntu-latest
-    needs: [vx-delegate-build, tim-vx-unit-test]
-    steps:
-    - name: download test binary
-      uses: actions/download-artifact@v3
-    - name: download model
-      run: |
-        wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite1/detection/metadata/1.tflite
-    - name: benchmark-model
-      run: |
-        chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
-        ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
+  # tfhub-efficientdet-lite1:
+  #   runs-on: ubuntu-latest
+  #   needs: [vx-delegate-build, tim-vx-unit-test]
+  #   steps:
+  #   - name: download test binary
+  #     uses: actions/download-artifact@v4
+  #   - name: download model
+  #     run: |
+  #       wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite1/detection/metadata/1.tflite
+  #   - name: benchmark-model
+  #     run: |
+  #       chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
+  #       ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite

-  tfhub-efficientdet-lite2:
-    runs-on: ubuntu-latest
-    needs: [vx-delegate-build, tim-vx-unit-test]
-    steps:
-    - name: download test binary
-      uses: actions/download-artifact@v3
-    - name: download model
-      run: |
-        wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
-    - name: benchmark-model
-      run: |
-        chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
-        ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
+  # tfhub-efficientdet-lite2:
+  #   runs-on: ubuntu-latest
+  #   needs: [vx-delegate-build, tim-vx-unit-test]
+  #   steps:
+  #   - name: download test binary
+  #     uses: actions/download-artifact@v4
+  #   - name: download model
+  #     run: |
+  #       wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
+  #   - name: benchmark-model
+  #     run: |
+  #       chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
+  #       ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite

-  tfhub-efficientdet-lite3:
-    runs-on: ubuntu-latest
-    needs: [vx-delegate-build, tim-vx-unit-test]
-    steps:
-    - name: download test binary
-      uses: actions/download-artifact@v3
-    - name: download model
-      run: |
-        wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
-    - name: benchmark-model
-      run: |
-        chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
-        ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
+  # tfhub-efficientdet-lite3:
+  #   runs-on: ubuntu-latest
+  #   needs: [vx-delegate-build, tim-vx-unit-test]
+  #   steps:
+  #   - name: download test binary
+  #     uses: actions/download-artifact@v4
+  #   - name: download model
+  #     run: |
+  #       wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
+  #   - name: benchmark-model
+  #     run: |
+  #       chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
+  #       ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite

   # acuity-yolov3-608-quant:
   #   runs-on: ubuntu-latest
   #   needs: [vx-delegate-build, tim-vx-unit-test]
   #   steps:
   #   - name: download test binary
-  #     uses: actions/download-artifact@v3
+  #     uses: actions/download-artifact@v4
   #   - name: download model
   #     run: |
   #       curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/yolov3_608relu_quant.acuity.tflite
@@ -359,7 +359,7 @@ jobs:
   #   needs: vx-delegate-build
   #   steps:
   #   - name: download test binary
-  #     uses: actions/download-artifact@v3
+  #     uses: actions/download-artifact@v4
   #   - name: download model
   #     run: |
   #       wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite4/detection/metadata/1.tflite
@@ -2,13 +2,13 @@ cmake_minimum_required (VERSION 3.14)
 project(tim-vx LANGUAGES C CXX)

 option(BUILD_SHARED_LIBS "Build using shared libraries" ON)
-option(TIM_VX_ENABLE_CUSTOM_OP "Enable custom op support" OFF)
+option(TIM_VX_ENABLE_CUSTOM_OP "Enable custom op support" ON)
 option(TIM_VX_ENABLE_TEST "Build the unit test" OFF)
 option(TIM_VX_ENABLE_LAYOUT_INFER "Enable layout inference support" ON)
-option(TIM_VX_ENABLE_NBG_PARSER "Enable NBG parser" OFF)
+option(TIM_VX_ENABLE_NBG_PARSER "Enable NBG parser" ON)
 option(TIM_VX_CODE_COVERAGE "Run code coverage with gconv(gcc only" OFF)
 option(TIM_VX_USE_EXTERNAL_OVXLIB "Use external OVXLIB" OFF)
-option(TIM_VX_BUILD_EXAMPLES "Build demos show general usage" OFF)
+option(TIM_VX_BUILD_EXAMPLES "Build demos show general usage" ON)
 option(TIM_VX_ENABLE_VIPLITE "Enable lite driver api support" OFF)
 option(TIM_VX_ENABLE_40BIT "Enable large memory support" OFF)
 option(TIM_VX_ENABLE_PLATFORM "Enable multi devices support" OFF)
@@ -35,7 +35,7 @@ Main Features
 - [TVM](https://github.com/VeriSilicon/tvm) (Fork)
 - [Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite) (Official)
 - [OpenCV](https://github.com/opencv/opencv/wiki/TIM-VX-Backend-For-Running-OpenCV-On-NPU) (Offical)
-- MLIR Dialect (In development)
+- [ONNXRuntime](https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/vsinpu) (Official)

 Feel free to raise a github issue if you wish to add TIM-VX for other frameworks.
@@ -9,7 +9,11 @@ list(APPEND OVXDRV_INCLUDE_DIRS
 if("${CONFIG}" STREQUAL "BUILDROOT")
     set(VIV_SDK_DRIVER_PREFIX "usr/lib")
 else()
-    set(VIV_SDK_DRIVER_PREFIX "drivers")
+    if(EXISTS ${EXTERNAL_VIV_SDK}/drivers)
+        set(VIV_SDK_DRIVER_PREFIX "drivers")
+    else()
+        set(VIV_SDK_DRIVER_PREFIX "lib")
+    endif()
 endif()

 message("using driver libs from ${EXTERNAL_VIV_SDK}/${VIV_SDK_DRIVER_PREFIX}")
(File diff suppressed because one or more lines are too long.)

(Image changed: 56 KiB before → 53 KiB after.)
@@ -25,72 +25,58 @@
 #define TIM_VX_LITE_NATIVE_H_

 #include "tim/vx/platform/platform.h"
 #include "vip_lite.h"
+#include "nbg_linker.h"

 namespace tim {
 namespace vx {
 namespace platform {

-class LiteNativeExecutor
-    : public IExecutor,
-      public std::enable_shared_from_this<LiteNativeExecutor> {
+class LiteNativeDevice : public IDevice {
  public:
-  LiteNativeExecutor(const std::shared_ptr<IDevice>& device);
-  virtual ~LiteNativeExecutor();
-  bool Submit(const std::shared_ptr<IExecutable>& executable,
-              const std::shared_ptr<IExecutable>& ref,
-              bool after = true) override;
-  bool Trigger(bool async = false) override;
-  std::shared_ptr<IExecutable> Compile(
-      const std::shared_ptr<Graph>& graph) override;
-
- private:
-  vip_task_descriptor_t* task_descriptor_;
-  vip_database database_;
+  virtual ~LiteNativeDevice() {};
+  virtual bool Submit(const std::shared_ptr<Graph>& graph) = 0;
+  virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0;
+  virtual bool DeviceExit() = 0;
+  virtual void WaitDeviceIdle() = 0;
+  virtual std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
+      const int32_t core_count = -1,
+      const std::shared_ptr<Context>& context = nullptr) = 0;
+  static std::vector<std::shared_ptr<IDevice>> Enumerate();
+  static bool vip_initialized;
 };

+class LiteNativeExecutor
+    : public IExecutor {
+ public:
+  virtual ~LiteNativeExecutor() {};
+  virtual bool Submit(const std::shared_ptr<IExecutable>& executable,
+                      const std::shared_ptr<IExecutable>& ref,
+                      bool after = true) = 0;
+  virtual bool Trigger(bool async = false) = 0;
+  virtual std::shared_ptr<IExecutable> Compile(
+      const std::shared_ptr<Graph>& graph) = 0;
+};

 class LiteNativeExecutable : public IExecutable {
  public:
-  LiteNativeExecutable(const std::shared_ptr<IExecutor>& executor,
-                       const std::vector<char>& nb_buf);
-  virtual ~LiteNativeExecutable();
-  void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
-  void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
-  void GetOutput(
-      const std::vector<std::shared_ptr<ITensorHandle>>& th) override;
-  bool Submit(const std::shared_ptr<IExecutable>& ref, bool after) override;
-  bool Trigger(bool async) override;
-  bool Verify() override;
-  std::shared_ptr<ITensorHandle> AllocateTensor(
-      const TensorSpec& tensor_spec) override;
-
-  vip_network network_;
-
- private:
-  void SetBuffer(vip_memory_t* dst, gcvip_videomemory_t* src);
-
-  int32_t input_count_;
-  int32_t output_count_;
-
-  gcvip_videomemory_t* coeff_;
-  gcvip_videomemory_t* command_;
-  gcvip_videomemory_t* memory_pool_;
-  gcvip_videomemory_t* others_;
-  gcvip_videomemory_t* pre_command_;
+  virtual ~LiteNativeExecutable() {};
+  virtual void SetInput(const std::shared_ptr<ITensorHandle>& th) = 0;
+  virtual void SetOutput(const std::shared_ptr<ITensorHandle>& th) = 0;
+  virtual void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
+  virtual void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
+  virtual bool Submit(const std::shared_ptr<IExecutable>& ref, bool after) = 0;
+  virtual bool Trigger(bool async) = 0;
+  virtual bool Verify() = 0;
+  virtual std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec,
+      void* data = nullptr, uint32_t size = 0) = 0;
 };

 class LiteNativeTensorHandle : public ITensorHandle {
  public:
-  LiteNativeTensorHandle(const std::shared_ptr<Tensor>& tensr);
-  virtual ~LiteNativeTensorHandle();
-  bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override;
-  bool CopyDataFromTensor(void* data) override;
-
-  gcvip_videomemory_t* tensor_buffer_;
+  virtual ~LiteNativeTensorHandle() {};
+  bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0;
+  bool CopyDataFromTensor(void* data) = 0;
 };
 } // namespace platform
 } // namespace vx
 } // namespace tim

 #endif
@@ -37,51 +37,41 @@ class NativeDevice : public IDevice {
   virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0;
   virtual bool DeviceExit() = 0;
   virtual void WaitDeviceIdle() = 0;
+  virtual std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
+      const int32_t core_count = -1,
+      const std::shared_ptr<Context>& context = nullptr) = 0;
   static std::vector<std::shared_ptr<IDevice>> Enumerate();
 };

 class NativeExecutable : public IExecutable {
  public:
-  NativeExecutable(const std::shared_ptr<IExecutor>& executor,
-                   const std::vector<char>& nb_buf, size_t inputs,
-                   size_t outputs);
-  ~NativeExecutable(){};
-  void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
-  void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
-  void GetOutput(
-      const std::vector<std::shared_ptr<ITensorHandle>>& th) override;
-  bool Submit(const std::shared_ptr<IExecutable>& ref,
-              bool after = true) override;
-  bool Trigger(bool async = false) override;
-  std::shared_ptr<ITensorHandle> AllocateTensor(
-      const TensorSpec& tensor_spec) override;
-  bool Verify() override;
-
- protected:
-  std::shared_ptr<tim::vx::ops::NBG> nb_node_;
-  std::vector<char> nb_buf_;
+  virtual ~NativeExecutable() {};
+  virtual void SetInput(const std::shared_ptr<ITensorHandle>& th) = 0;
+  virtual void SetOutput(const std::shared_ptr<ITensorHandle>& th) = 0;
+  virtual void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
+  virtual void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
+  virtual bool Submit(const std::shared_ptr<IExecutable>& ref,
+                      bool after = true) = 0;
+  virtual bool Trigger(bool async = false) = 0;
+  virtual std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec,
+      void* data = nullptr, uint32_t size = 0) = 0;
+  virtual bool Verify() = 0;
 };

-class NativeExecutor : public IExecutor,
-                       public std::enable_shared_from_this<NativeExecutor> {
+class NativeExecutor : public IExecutor {
  public:
-  NativeExecutor(const std::shared_ptr<IDevice>& device);
-  NativeExecutor(const std::shared_ptr<IDevice>& device,
-                 const std::shared_ptr<Context>& context);
-  ~NativeExecutor(){};
-  bool Submit(const std::shared_ptr<IExecutable>& executable,
-              const std::shared_ptr<IExecutable>& ref,
-              bool after = true) override;
-  bool Trigger(bool async = false) override;
-  std::shared_ptr<IExecutable> Compile(
-      const std::shared_ptr<Graph>& graph) override;
+  virtual ~NativeExecutor(){};
+  virtual bool Submit(const std::shared_ptr<IExecutable>& executable,
+                      const std::shared_ptr<IExecutable>& ref,
+                      bool after = true) = 0;
+  virtual bool Trigger(bool async = false) = 0;
+  virtual std::shared_ptr<IExecutable> Compile(const std::shared_ptr<Graph>& graph) = 0;
 };

 class NativeTensorHandle : public ITensorHandle {
  public:
-  NativeTensorHandle(const std::shared_ptr<Tensor>& tensor);
-  bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override;
-  bool CopyDataFromTensor(void* data) override;
+  virtual bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0;
+  virtual bool CopyDataFromTensor(void* data) = 0;
 };

 } // namespace platform
@@ -46,15 +46,12 @@ namespace platform {

 class IDevice;
 class IExecutable;
-class ExecutableSet;
 class IExecutor;
 class ITensorHandle;

 std::shared_ptr<IExecutable> Compile(
     const std::shared_ptr<Graph>& graph,
     const std::shared_ptr<IExecutor>& executor);
-std::shared_ptr<IExecutable> CreateExecutableSet(
-    const std::vector<std::shared_ptr<IExecutable>>& executables);

 class IDevice {
  public:
@@ -68,17 +65,25 @@ class IDevice {
   virtual ~IDevice(){};
   virtual bool Submit(const std::shared_ptr<Graph>& graph) = 0;
   virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0;
-  device_id_t Id() const;
+  device_id_t Id() const { return device_id_;};
   virtual void WaitDeviceIdle() = 0;
   virtual bool DeviceExit() = 0;
   virtual void RemoteReset();
+  uint32_t CoreCount() const {return core_count_;};
+  virtual std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
+      const int32_t core_count = -1,
+      const std::shared_ptr<Context>& context = nullptr) = 0;
   static std::vector<std::shared_ptr<IDevice>> Enumerate();

  protected:
   device_id_t device_id_;
+  uint32_t core_count_;
 };

 class IExecutor {
  public:
   //using task = std::shared_ptr<IExecutable>;
   using task = std::weak_ptr<IExecutable>;
   virtual ~IExecutor(){};
   virtual bool Submit(const std::shared_ptr<IExecutable>& executable,
@@ -87,13 +92,17 @@ class IExecutor {
   virtual bool Trigger(bool async = false) = 0; // todo: async=true
   virtual std::shared_ptr<IExecutable> Compile(
       const std::shared_ptr<Graph>& graph) = 0;
-  virtual std::shared_ptr<IDevice> Device() const;
-  virtual std::shared_ptr<Context> Contex() const;
+  virtual std::shared_ptr<IDevice> Device() const {return device_;};
+  virtual std::shared_ptr<Context> Contex() const {return context_;};
+  virtual uint32_t CoreIndex() const {return core_index_; };
+  virtual uint32_t CoreCount() const {return core_count_; };

  protected:
   std::vector<task> tasks_;
   std::shared_ptr<IDevice> device_;
   std::shared_ptr<Context> context_;
+  uint32_t core_index_;
+  uint32_t core_count_;
 };

 class IExecutable : public std::enable_shared_from_this<IExecutable> {
@@ -101,40 +110,24 @@ class IExecutable : public std::enable_shared_from_this<IExecutable> {
   virtual ~IExecutable(){};
   virtual void SetInput(const std::shared_ptr<ITensorHandle>& th) = 0;
   virtual void SetOutput(const std::shared_ptr<ITensorHandle>& th) = 0;
-  virtual void GetOutput(
-      const std::vector<std::shared_ptr<ITensorHandle>>& th) = 0; // for remote
+  virtual void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
+  virtual void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
+  virtual std::vector<std::shared_ptr<ITensorHandle>> GetOutputs() { return input_handles_;};
+  virtual std::vector<std::shared_ptr<ITensorHandle>> Getinputs() { return input_handles_;};
   virtual bool Submit(const std::shared_ptr<IExecutable>& ref,
                       bool after = true) = 0;
   virtual bool Trigger(bool async = false) = 0; // todo: async=true
   virtual bool Verify() = 0;
-  virtual std::shared_ptr<Graph> NBGraph() const;
-  virtual std::shared_ptr<ITensorHandle> AllocateTensor(
-      const TensorSpec& tensor_spec) = 0;
-  virtual std::shared_ptr<IExecutor> Executor() const;
+  std::shared_ptr<Graph> NBGraph() const {return nb_graph_;};
+  virtual std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec ,
+      void* data = nullptr, uint32_t size = 0) = 0;

  protected:
   std::weak_ptr<IExecutor> executor_;
   std::shared_ptr<Context> context_;
   std::shared_ptr<Graph> nb_graph_;
 };

-class ExecutableSet : public IExecutable {
- public:
-  ExecutableSet(const std::vector<std::shared_ptr<IExecutable>>& executables);
-  void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
-  void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
-  void GetOutput(
-      const std::vector<std::shared_ptr<ITensorHandle>>& th) override;
-  bool Submit(const std::shared_ptr<IExecutable>& ref,
-              bool after = true) override;
-  bool Trigger(bool async = false) override;
-  bool Verify() override;
-  std::shared_ptr<ITensorHandle> AllocateTensor(
-      const TensorSpec& tensor_spec) override;
-  std::vector<std::shared_ptr<IExecutable>> Executables() const;
-
- protected:
-  std::vector<std::shared_ptr<IExecutable>> executables_;
-  std::vector<std::shared_ptr<ITensorHandle>> input_handles_;
-  std::vector<std::shared_ptr<ITensorHandle>> output_handles_;
-};
-
 class ITensorHandle {
@@ -142,13 +135,15 @@ class ITensorHandle {
   virtual ~ITensorHandle(){};
   virtual bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0;
   virtual bool CopyDataFromTensor(void* data) = 0;
-  virtual std::shared_ptr<Tensor> GetTensor() const;
+  virtual std::shared_ptr<Tensor> GetTensor() const { return tensor_;};
+  virtual TensorSpec& GetSpec() { return spec_;};

  protected:
   std::shared_ptr<Tensor> tensor_;
+  TensorSpec spec_;
 };

 } // namespace platform
 } // namespace vx
 } // namespace tim
 #endif
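The net effect of the platform.h changes above is that executors are no longer constructed directly; they come from `IDevice::CreateExecutor`, and the simple accessors (`Id`, `Device`, `GetTensor`, ...) became inline. A minimal sketch of the resulting call flow, assuming a `build_graph()` helper and a 10-element output, both of which are illustrative rather than part of this diff:

```cpp
#include <memory>
#include <vector>

#include "tim/vx/context.h"
#include "tim/vx/graph.h"
#include "tim/vx/platform/platform.h"

// Hypothetical helper: builds any TIM-VX graph (lenet in the samples).
std::shared_ptr<tim::vx::Graph> build_graph(
    const std::shared_ptr<tim::vx::Context>& ctx);

int run_once() {
  auto context = tim::vx::Context::Create();
  auto graph = build_graph(context);

  // Executors now come from the device instead of being constructed directly.
  auto devices = tim::vx::platform::IDevice::Enumerate();
  auto device = devices[0];
  auto executor = device->CreateExecutor(/*core_index=*/0, /*core_count=*/-1, context);

  auto executable = tim::vx::platform::Compile(graph, executor);  // compile to NBG
  auto input = executable->AllocateTensor(graph->InputsTensor()[0]->GetSpec());
  auto output = executable->AllocateTensor(graph->OutputsTensor()[0]->GetSpec());
  executable->SetInput(input);
  executable->SetOutput(output);
  executable->Submit(executable);  // self-reference: no ordering dependency
  executor->Trigger();             // run everything submitted to this executor

  std::vector<float> result(10);   // output size is illustrative
  return output->CopyDataFromTensor(result.data()) ? 0 : -1;
}
```

This mirrors the updated samples later in the diff; hiding the concrete `NativeExecutor`/`LiteNativeExecutor` types behind the factory lets the same calling code run on either the full driver or the VIPLITE backend.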
@@ -20,9 +20,7 @@ endif()
 if(TIM_VX_ENABLE_PLATFORM)
     add_subdirectory("lenet_multi_device")
     add_subdirectory("multi_device")
-    if(${TIM_VX_ENABLE_PLATFORM_LITE})
-        add_subdirectory("lite_multi_device")
-    endif()
+    add_subdirectory("platform_sample")
     if(TIM_VX_ENABLE_GRPC)
         add_subdirectory("grpc")
     endif()
@@ -11,5 +11,10 @@ target_include_directories(${TARGET_NAME} PRIVATE
     ${PROJECT_SOURCE_DIR}/include
 )

+target_include_directories(${TARGET_NAME} PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${PROJECT_SOURCE_DIR}/include
+)
+
 install(TARGETS ${TARGET_NAME} ${TARGET_NAME}
         DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})
@@ -33,7 +33,6 @@
 #include "tim/vx/context.h"
 #include "tim/vx/graph.h"
 #include "tim/vx/platform/platform.h"
-#include "tim/vx/platform/native.h"

 std::vector<uint8_t> input_data = {
     0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 2, 0, 0, 8, 0,
@@ -108,17 +107,17 @@ static void printTopN(const T* prob, int outputCount, int topNum) {
   }
 }

 int main(int argc, char** argv) {
   (void) argc, (void) argv;
   auto context0 = tim::vx::Context::Create();
   auto graph0 = lenet(context0);
   auto graph1 = lenet(context0);

-  auto devices = tim::vx::platform::NativeDevice::Enumerate();
+  auto devices = tim::vx::platform::IDevice::Enumerate();
   auto device = devices[0];
-  std::shared_ptr<tim::vx::platform::IExecutor> executor = std::make_shared<tim::vx::platform::NativeExecutor> (device);
-
-  auto executable0 = tim::vx::platform::Compile(graph0, executor); // compile to nbg
+  auto executor = device->CreateExecutor(0,-1,context0);
+  auto executable0 = tim::vx::platform::Compile(graph0, executor);
   auto input_handle0 = executable0->AllocateTensor(graph0->InputsTensor()[0]->GetSpec());
   auto output_handle0 = executable0->AllocateTensor(graph0->OutputsTensor()[0]->GetSpec());
   executable0->SetInput(input_handle0);
@@ -127,7 +126,18 @@ int main(int argc, char** argv) {
   assert(executable0->Submit(executable0));
   executable0->Trigger();

-  auto executable1 = tim::vx::platform::Compile(graph1, executor); // compile to nbg
+  std::vector<float> output_data;
+  output_data.resize(1 * 10);
+  if (!output_handle0->CopyDataFromTensor(output_data.data())) {
+    std::cout << "Copy output data fail." << std::endl;
+    return -1;
+  }
+  std::cout << "executable0 out." << std::endl;
+  printTopN(output_data.data(), output_data.size(), 5);
+  output_data.assign(output_data.size(),0);
+  output_handle0->CopyDataToTensor(output_data.data(), output_data.size());
+
+  auto executable1 = tim::vx::platform::Compile(graph1, executor);
   auto input_handle1 = executable1->AllocateTensor(graph1->InputsTensor()[0]->GetSpec());
   auto output_handle1 = executable1->AllocateTensor(graph1->OutputsTensor()[0]->GetSpec());
   executable1->SetInput(input_handle1);
@@ -136,34 +146,28 @@ int main(int argc, char** argv) {
   assert(executable1->Submit(executable0));
   executable1->Trigger();

+  std::vector<float> output_data1;
+  output_data1.resize(1 * 10);
+  if (!output_handle1->CopyDataFromTensor(output_data1.data())) {
+    std::cout << "Copy output data fail." << std::endl;
+    return -1;
+  }
+  std::cout << "executable1 out." << std::endl;
+  printTopN(output_data1.data(), output_data1.size(), 5);
+  output_data1.assign(output_data1.size(),0);
+  output_handle1->CopyDataToTensor(output_data1.data(), output_data1.size());

-  executor->Submit(executable0, executable0);
-  executor->Submit(executable1, executable0);
-
-  std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables0;
-  executables0.push_back(executable0);
-  executables0.push_back(executable1);
-  auto executable_set0 = tim::vx::platform::CreateExecutableSet(executables0);
-  executor->Submit(executable_set0, executable_set0);
-  executor->Trigger();
-
-  std::vector<uint8_t> input_data0;
-  input_data0.resize(28 * 28);
-  if (!input_handle0->CopyDataFromTensor(input_data0.data())) {
-    std::cout << "Copy intput data fail." << std::endl;
-    return -1;
-  }
-  printTopN(input_data0.data(), input_data0.size(), 5);
-
-  std::vector<float> output_data;
-  output_data.resize(1 * 10);
-  std::cout << "executor out." << std::endl;
-  if (!output_handle0->CopyDataFromTensor(output_data.data())) {
-    std::cout << "Copy output data fail." << std::endl;
-    return -1;
-  }
-  printTopN(output_data.data(), output_data.size(), 5);
-
-  std::vector<float> output_data1;
-  output_data1.resize(1 * 10);
-  if (!output_handle1->CopyDataFromTensor(output_data1.data())) {
-    std::cout << "Copy output data fail." << std::endl;
-    return -1;
@@ -1,13 +0,0 @@
-message("samples/lite_multi_device")
-
-set(TARGET_NAME "lite_multi_device")
-
-add_executable(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/lite_multi_device.cc)
-
-target_link_libraries(${TARGET_NAME} PRIVATE -Wl,--whole-archive tim-vx)
-target_include_directories(${TARGET_NAME} PRIVATE
-    ${PROJECT_SOURCE_DIR}/include
-    ${PROJECT_SOURCE_DIR}/prebuilt-sdk/viplite/build/sdk/include)
-
-install(TARGETS ${TARGET_NAME} ${TARGET_NAME}
-        DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})
@@ -1,15 +1,25 @@
 ## brief
-The multi_device demo uses some acuity exported tim-vx networks, and running on 4 devices of NPU using platform api.
+The multi_device demo uses some acuity exported tim-vx networks, and running on multi-core devices of NPU using platform api.

-## environment
-export VSIMULATOR_CONFIG=VIP9400O_PID0XD9
-export VIV_MGPU_AFFINITY="1:0"
-export VIV_OVX_USE_MULTI_DEVICE="1:1"
-export TIM_VX_ROOT="${workspaceFolder}/tim-vx"
+## note
+Please note that if you have enabled lite platform, a dedicated VIVANTE_SDK(NO_KERNEL) is required as the compiler for NBG.
+The driver for the NPU is the VIPLITE driver
+
+##requirements
+Vivante SDK >= 6.4.22
+ovxlib >= 1.2.26
+viplite >=2.0.0

 ## build
 cd build
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON
+cmake .. -DCMAKE_BUILD_TYPE=Release -DTIM_VX_USE_EXTERNAL_OVXLIB=ON -DEXTERNAL_VIV_SDK=${VIVANTE_NOKERNEL_SDK_DIR} -DOVXLIB_INC=${OVXLIB_DIR}/include \
+-DOVXLIB_LIB=/path/to/libovxlib.so -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON -DTIM_VX_ENABLE_PLATFORM_LITE=ON -DVIP_LITE_SDK=${VIP_LITE_SDK}
+
+## environment
+# Export VIV_GPU_FILE to specify the NPU hardware configuration file for the NBG compiler
+# VIV_GPU_FILE Specify the NPU hardware configuration file for the NBG compiler
+export VIV_GPU_FILE="/path/to/VIP9400NANOQ_PLUS_PID0X10000055.config"
+export TIM_VX_ROOT="${workspaceFolder}/tim-vx"

 ## run
 cd build
@@ -35,7 +35,6 @@
 #include "tim/vx/context.h"
 #include "tim/vx/graph.h"
 #include "tim/vx/platform/platform.h"
-#include "tim/vx/platform/native.h"
 #include "vx_lenet.h"
 #include "vx_mobilenet.h"
 #include "vx_resnet50.h"
@@ -59,7 +58,7 @@ static void printTopN(const T* prob, int outputCount, int topNum) {
 }

 template <typename T>
-void print_topN(std::size_t size, std::shared_ptr<tim::vx::platform::ITensorHandle> handle) {
+void print_topN(std::size_t size, std::shared_ptr<tim::vx::platform::ITensorHandle> & handle) {
   std::vector<T> output_data;
   output_data.resize(size);
   if (!handle->CopyDataFromTensor(output_data.data())) {
@@ -94,7 +93,8 @@ void executor_trigger(std::shared_ptr<tim::vx::platform::IExecutor> executor) {
 }

+auto context = tim::vx::Context::Create();
-std::pair<std::shared_ptr<tim::vx::platform::IExecutable>, std::shared_ptr<tim::vx::platform::ITensorHandle>> generate_executable(
+std::pair<std::shared_ptr<tim::vx::platform::IExecutable>, std::shared_ptr<tim::vx::platform::ITensorHandle>>
+generate_executable(
     std::shared_ptr<tim::vx::platform::IExecutor> executor,
     std::function<void(std::shared_ptr<tim::vx::Graph>, const char*)> construct_func,
     std::string weight_file,
@@ -114,15 +114,17 @@ std::pair<std::shared_ptr<tim::vx::platform::IExecutable>, std::shared_ptr<tim::

 int main(int argc, char** argv) {
   (void) argc, (void) argv;
-  auto devices = tim::vx::platform::NativeDevice::Enumerate();
+  auto devices = tim::vx::platform::IDevice::Enumerate();
   auto device0 = devices[0];
-  std::shared_ptr<tim::vx::platform::IExecutor> executor0 = std::make_shared<tim::vx::platform::NativeExecutor> (device0);
-  auto device1 = devices[1];
-  std::shared_ptr<tim::vx::platform::IExecutor> executor1 = std::make_shared<tim::vx::platform::NativeExecutor> (device1);
-  auto device2 = devices[2];
-  std::shared_ptr<tim::vx::platform::IExecutor> executor2 = std::make_shared<tim::vx::platform::NativeExecutor> (device2);
-  auto device3 = devices[3];
-  std::shared_ptr<tim::vx::platform::IExecutor> executor3 = std::make_shared<tim::vx::platform::NativeExecutor> (device3);
+  auto total_core_count = device0->CoreCount();
+  uint32_t core_index = 0;
+  auto use_core_count = 1;
+  std::vector<std::shared_ptr<tim::vx::platform::IExecutor>> executors;
+
+  for(core_index = 0; core_index < total_core_count; core_index += use_core_count) {
+    auto executor = device0->CreateExecutor(core_index,use_core_count, context);
+    executors.push_back(executor);
+  }

   auto root = std::getenv("TIM_VX_ROOT");
   assert(root != NULL);
@@ -142,46 +144,57 @@ int main(int argc, char** argv) {
   auto resnet50_weight_file = ROOT + "/samples/multi_device/resnet50/resnet50.export.data";
   std::function<void(std::shared_ptr<tim::vx::Graph>, const char*)> resnet50_construct_func = acuitylite::resnet50::construct_graph;

-  std::shared_ptr<tim::vx::platform::IExecutable> lenet_0, lenet_2, lenet_3, mobilenet_1, mobilenet_2, mobilenet_3, resnet50_0, resnet50_1;
-  std::shared_ptr<tim::vx::platform::ITensorHandle> lenet_0_outhandle, lenet_2_outhandle, lenet_3_outhandle, mobilenet_1_outhandle, mobilenet_2_outhandle, mobilenet_3_outhandle,
-      resnet50_0_outhandle, resnet50_1_outhandle;
+  auto excutor_cnt = executors.size();

-  std::tie(lenet_0, lenet_0_outhandle) = generate_executable(executor0, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes);
-  std::tie(resnet50_0, resnet50_0_outhandle) = generate_executable(executor0, resnet50_construct_func, resnet50_weight_file, resnet50_input_files, resnet50_input_bytes);
-  executor0->Submit(lenet_0, lenet_0);
-  executor0->Submit(resnet50_0, lenet_0);
+  //each excutor run 2 models.
+  auto lenet = [&](std::shared_ptr<tim::vx::platform::IExecutor> executor) {
+    return generate_executable(executor, lenet_construct_func, lenet_weight_file,
+                               lenet_input_files, lenet_input_bytes);
+  };
+  auto resnet = [&](std::shared_ptr<tim::vx::platform::IExecutor> executor) {
+    return generate_executable(executor, resnet50_construct_func, resnet50_weight_file,
+                               resnet50_input_files, resnet50_input_bytes);
+  };
+  auto mobilenet = [&](std::shared_ptr<tim::vx::platform::IExecutor> executor) {
+    return generate_executable(executor, mobilenet_construct_func, mobilenet_weight_file,
+                               mobilenet_input_files, mobilenet_input_bytes);
+  };
+  std::vector<std::pair<std::shared_ptr<tim::vx::platform::IExecutable>,
+                        std::shared_ptr<tim::vx::platform::ITensorHandle>>> nets;
+  for (size_t i = 0; i < excutor_cnt; i++) {
+    if(i % 3 == 0) {
+      //lenet + resnet
+      nets.push_back(lenet(executors[i]));
+      executors[i]->Submit(nets.back().first, nets.back().first);
+      nets.push_back(resnet(executors[i]));
+      executors[i]->Submit(nets.back().first, nets.back().first);
+    }
+    if(i % 3 == 1) {
+      //resnet + mobilenet
+      nets.push_back(resnet(executors[i]));
+      executors[i]->Submit(nets.back().first, nets.back().first);
+      nets.push_back(mobilenet(executors[i]));
+      executors[i]->Submit(nets.back().first, nets.back().first);
+    }
+    if(i % 3 == 2) {
+      //lenet + mobilenet
+      nets.push_back(mobilenet(executors[i]));
+      executors[i]->Submit(nets.back().first, nets.back().first);
+      nets.push_back(lenet(executors[i]));
+      executors[i]->Submit(nets.back().first, nets.back().first);
+    }
+  }
+  std::vector<std::thread> threads;
+  for(auto executor:executors) {
+    threads.push_back(std::thread(executor_trigger, executor));
+  }
+  for(std::thread &t : threads) {
+    t.join();
+  }

-  std::tie(mobilenet_1, mobilenet_1_outhandle) = generate_executable(executor1, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes);
-  std::tie(resnet50_1, resnet50_1_outhandle) = generate_executable(executor1, resnet50_construct_func, resnet50_weight_file, resnet50_input_files, resnet50_input_bytes);
-  auto executable_set1 = tim::vx::platform::CreateExecutableSet({mobilenet_1, resnet50_1});
-  executor1->Submit(executable_set1, executable_set1);
-
-  std::tie(lenet_2, lenet_2_outhandle) = generate_executable(executor2, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes);
-  std::tie(mobilenet_2, mobilenet_2_outhandle) = generate_executable(executor2, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes);
-  auto executable_set2 = tim::vx::platform::CreateExecutableSet({lenet_2, mobilenet_2});
-  executor2->Submit(executable_set2, executable_set2);
-
-  std::tie(lenet_3, lenet_3_outhandle) = generate_executable(executor3, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes);
-  std::tie(mobilenet_3, mobilenet_3_outhandle) = generate_executable(executor3, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes);
-  auto executable_set3 = tim::vx::platform::CreateExecutableSet({lenet_3, mobilenet_3});
-  executor3->Submit(executable_set3, executable_set3);
-
-  std::thread t0(executor_trigger, executor0);
-  std::thread t1(executor_trigger, executor1);
-  std::thread t2(executor_trigger, executor2);
-  std::thread t3(executor_trigger, executor3);
-  t0.join();
-  t1.join();
-  t2.join();
-  t3.join();
-
-  print_topN<float>(1 * 10, lenet_0_outhandle);
-  print_topN<float>(1 * 10, lenet_2_outhandle);
-  print_topN<float>(1 * 10, lenet_3_outhandle);
-  print_topN<float>(1 * 1001, mobilenet_1_outhandle);
-  print_topN<float>(1 * 1001, mobilenet_2_outhandle);
-  print_topN<float>(1 * 1001, mobilenet_3_outhandle);
-  print_topN<uint16_t>(1 * 1000, resnet50_0_outhandle);
-  print_topN<uint16_t>(1 * 1000, resnet50_1_outhandle);
+  for (auto net : nets) {
+    auto size = net.second->GetSpec().GetElementNum();
+    print_topN<float>(size, net.second);
+  }
   return 0;
 }
@@ -29,7 +29,7 @@
 #include "tim/vx/graph.h"
 #include "tim/vx/operation.h"
 #include "tim/vx/tensor.h"
-#include "tim/vx/platform/native.h"
+#include "tim/vx/platform/platform.h"

 static void printTopN() {
 }
@@ -46,9 +46,9 @@ int demo(int argc, char** argv) {
   tim::vx::TensorSpec g0_input0, g0_output0, g1_output0, g2_output0, g3_output0, g4_output0, g5_output0;

   // query device and get executor of devcie
-  auto devices = tim::vx::platform::NativeDevice::Enumerate();
+  auto devices = tim::vx::platform::IDevice::Enumerate();
   auto device = devices[0];
-  std::shared_ptr<tim::vx::platform::IExecutor> executor = std::make_shared<tim::vx::platform::NativeExecutor> (device);
+  auto executor = device->CreateExecutor(0,-1, context);

   // executable0
   auto executable0 = executor->Compile(g0); // compile to nbg
@@ -89,33 +89,6 @@ int demo(int argc, char** argv) {
   // trigger
   executor->Trigger(); // run all submitted executables

-  /* 2. another way to run */
-  // executable_set0
-  std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables0;
-  executables0.push_back(executable0);
-  auto executable_set0 = CreateExecutableSet(executables0);
-  // executable_set1
-  std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables1;
-  executables1.push_back(executable1);
-  executables1.push_back(executable3);
-  auto executable_set1 = CreateExecutableSet(executables1);
-  // executable_set2
-  std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables2;
-  executables2.push_back(executable2);
-  executables2.push_back(executable4);
-  auto executable_set2 = CreateExecutableSet(executables2);
-  // executable_set3
-  std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables3;
-  executables3.push_back(executable5);
-  auto executable_set3 = CreateExecutableSet(executables3);
-  // submit executaleSets
-  executable_set0->Submit(executable_set0);
-  executable_set1->Submit(executable_set0);
-  executable_set2->Submit(executable_set1);
-  executable_set3->Submit(executable_set2);
-  // trigger
-  executor->Trigger(); // run all submitted executableSets
-
   printTopN();

   return 0;
@@ -1296,7 +1296,7 @@ void resnet50::construct_graph
   auto input_0 = graph->CreateTensor(input_0_spec);

   tim::vx::ShapeType output_229_shape({1000,1});
-  tim::vx::TensorSpec output_229_spec(tim::vx::DataType::FLOAT16, output_229_shape,
+  tim::vx::TensorSpec output_229_spec(tim::vx::DataType::FLOAT32, output_229_shape,
                                       tim::vx::TensorAttribute::OUTPUT);
   auto output_229 = graph->CreateTensor(output_229_spec);
@@ -0,0 +1,13 @@
+message("samples/platform_sample")
+
+set(TARGET_NAME "platform_sample")
+
+add_executable(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/platform_sample.cc)
+
+target_link_libraries(${TARGET_NAME} PRIVATE -Wl,--whole-archive tim-vx)
+target_include_directories(${TARGET_NAME} PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${PROJECT_SOURCE_DIR}/include)
+
+install(TARGETS ${TARGET_NAME} ${TARGET_NAME}
+        DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})
@@ -0,0 +1,25 @@
+## brief
+The platform sample usage which use platform api.
+
+## note
+Please note that if you have enabled lite platform, a dedicated VIVANTE_SDK(NO_KERNEL) is required as the compiler for NBG.
+The driver for the NPU is the VIPLITE driver
+
+##requirements
+Vivante SDK >= 6.4.22
+ovxlib >= 1.2.26
+viplite >=2.0.0
+
+## build
+cd build
+cmake .. -DCMAKE_BUILD_TYPE=Release -DTIM_VX_USE_EXTERNAL_OVXLIB=ON -DEXTERNAL_VIV_SDK=${VIVANTE_NOKERNEL_SDK_DIR} -DOVXLIB_INC=${OVXLIB_DIR}/include \
+-DOVXLIB_LIB=${VIVANTE_NOKERNEL_SDK_DIR}/drivers/libovxlib.so -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON \
+-DTIM_VX_ENABLE_PLATFORM_LITE=ON -DVIP_LITE_SDK=${VIP_LITE_SDK}
+
+## environment
+# Export VIV_GPU_FILE to specify the NPU hardware configuration file for the NBG compiler
+export VIV_GPU_FILE="/path/to/VIP9000NANOQ_PLUS_PID0X100000XX.config"
+
+## run
+cd build
+./samples/platform_sample/platform_sample
@@ -26,8 +26,8 @@
 #include "tim/vx/graph.h"
 #include "tim/vx/ops.h"
 #include "tim/vx/types.h"
-#include "tim/vx/platform/native.h"
-#include "tim/vx/platform/lite/lite_native.h"
+#include "tim/vx/platform/platform.h"
+

 int main() {
   //construct tim-vx graph
@@ -49,9 +49,15 @@ int main() {
   std::vector<int> data_vec_i0({1, 2, 3, 4});
   std::vector<int> data_vec_i1({4, 3, 2, 1});

-  auto devices = tim::vx::platform::NativeDevice::Enumerate();
+  auto devices = tim::vx::platform::IDevice::Enumerate();
+
+  std::cout << "NPU device count: " << devices.size() <<std::endl;
   auto device = devices[0];
-  auto executor = std::make_shared<tim::vx::platform::LiteNativeExecutor>(device);
+  //run 1 core in device 0
+  std::cout << "NPU device[0] has " << device->CoreCount() << "cores" <<std::endl;
+  auto use_core_count = -1;
+  auto executor = device->CreateExecutor(use_core_count);

   auto executable = executor->Compile(graph);
   auto input0_handle = executable->AllocateTensor(input_spec);
   auto input1_handle = executable->AllocateTensor(input_spec);
@@ -73,6 +79,10 @@ int main() {
   //each output value should be "5" in this demo
   for (int i = 0; i < 4; ++i) {
     std::cout << "output value: " << data[i] << std::endl;
+    if(data[i] != 5) {
+      std::cout << "test failed" << std::endl;
+      break;
+    }
   }
   free(data);
   return 0;
@@ -61,8 +61,10 @@ if(TIM_VX_ENABLE_PLATFORM)
     endif()
     list(APPEND LITE_EXTERNAL_LIBS
         ${VIP_LITE_SDK}/drivers/libNBGlinker.so
-        ${VIP_LITE_SDK}/drivers/libVIPlite.so)
-    list(APPEND LITE_INC_DIRS ${VIP_LITE_SDK}/include)
+        ${VIP_LITE_SDK}/drivers/libVIPhal.so)
+    list(APPEND LITE_INC_DIRS
+        ${VIP_LITE_SDK}/include
+        ${VIP_LITE_SDK}/include/nbg_linker)
 endif()

 if(TIM_VX_ENABLE_GRPC)
@@ -26,6 +26,7 @@

 #include <array>
 #include <cassert>
+#include <cstdint>
 #include <memory>
 #include <vector>
 #include <string>
@@ -9,3 +9,4 @@ DEF_NODE_TYPE(custom_sample)
 DEF_NODE_TYPE(custom_tiny_yolov4_postprocess)
 DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_confidence)
 DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_box)
+DEF_NODE_TYPE(custom_letterbox)
@@ -9,3 +9,4 @@ DEF_OP(CUSTOM_SAMPLE)
 DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS)
 DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE)
 DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX)
+DEF_OP(CUSTOM_LETTERBOX)
@@ -0,0 +1,61 @@
+/****************************************************************************
+*
+*    Copyright (c) 2020 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef _VSI_NN_OP_CUSTOM_LETTERBOX_H
+#define _VSI_NN_OP_CUSTOM_LETTERBOX_H
+
+#include "vsi_nn_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _vsi_nn_custom_letterbox_param
+{
+    struct _custom_letterbox_local_data_t* local;
+    int32_t new_shape_w;
+    int32_t new_shape_h;
+    vx_bool auto_bool;
+    vx_bool scaleFill;
+    vx_bool scaleup;
+    int32_t stride;
+    vx_bool center;
+    float mean_r;
+    float mean_g;
+    float mean_b;
+    float scale_r;
+    float scale_g;
+    float scale_b;
+    int32_t pad_value_r;
+    int32_t pad_value_g;
+    int32_t pad_value_b;
+    vx_bool reverse_channel;
+} vsi_nn_custom_letterbox_param;
+_compiler_assert(offsetof(vsi_nn_custom_letterbox_param, local) == 0, \
+    vsi_nn_custom_lertterbox_h );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
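The fields of `vsi_nn_custom_letterbox_param` (target shape, `scaleup`, `center`, per-channel pad values) mirror the letterbox preprocessing used in YOLO-style pipelines. The kernel itself is not shown in this diff, so the following geometry sketch only illustrates the usual convention the parameters suggest:

```cpp
#include <algorithm>
#include <cmath>

// Hypothetical helper mirroring the geometry fields of
// vsi_nn_custom_letterbox_param; the actual kernel is not shown here.
struct LetterboxGeom {
  int resized_w, resized_h;  // image size after uniform scaling
  int pad_left, pad_top;     // padding added to reach the target shape
};

LetterboxGeom letterbox_geom(int img_w, int img_h,
                             int new_shape_w, int new_shape_h,
                             bool scaleup, bool center) {
  // Uniform scale that fits the whole image inside the target shape.
  float r = std::min((float)new_shape_w / img_w, (float)new_shape_h / img_h);
  if (!scaleup) r = std::min(r, 1.0f);  // only shrink, never enlarge

  LetterboxGeom g;
  g.resized_w = (int)std::round(img_w * r);
  g.resized_h = (int)std::round(img_h * r);
  int pad_w = new_shape_w - g.resized_w;
  int pad_h = new_shape_h - g.resized_h;
  // 'center' splits the padding evenly; otherwise it goes to bottom/right.
  g.pad_left = center ? pad_w / 2 : 0;
  g.pad_top  = center ? pad_h / 2 : 0;
  return g;
}
```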
@@ -34,5 +34,6 @@
 #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h"
 #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h"
 #include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h"
+#include "custom/ops/vsi_nn_op_custom_letterbox.h"

 #endif
@@ -203,3 +203,4 @@ DEF_OP(BITCAST)
 DEF_OP(GROUPED_CONV3D)
 DEF_OP(COL2IM)
 DEF_OP(L1_LAYER_NORM)
+DEF_OP(ROPE)
@@ -80,7 +80,7 @@ typedef struct _vsi_nn_pre_process_rgb_param
     float g_scale;
     float b_scale;
     /* pre process rgb layer local data structure */
-    vsi_nn_pre_process_rgb_lcl_data local;
+    vsi_nn_pre_process_rgb_lcl_data *local;
 } vsi_nn_pre_process_rgb_param;

 #ifdef __cplusplus
@@ -1,6 +1,6 @@
 /****************************************************************************
 *
-*    Copyright (c) 2020-2023 Vivante Corporation
+*    Copyright (c) 2020 Vivante Corporation
 *
 *    Permission is hereby granted, free of charge, to any person obtaining a
 *    copy of this software and associated documentation files (the "Software"),
@@ -21,38 +21,29 @@
 * DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/
-#ifndef TIM_VX_NATIVE_DEVICE_PRIVATE_H_
-#define TIM_VX_NATIVE_DEVICE_PRIVATE_H_
-
-#include "tim/vx/platform/native.h"
-#include "vip/virtual_device.h"
-#include "graph_private.h"
+#ifndef _VSI_NN_OP_ROPE_H
+#define _VSI_NN_OP_ROPE_H

-namespace tim {
-namespace vx {
+#include "vsi_nn_types.h"

-class GraphImpl;
+#ifdef __cplusplus
+extern "C" {
+#endif

-namespace platform {
+typedef struct _vsi_nn_rope_param
+{
+    struct _rope_local_data_t* local;
+    // Add parameters here
+    int32_t axis;
+    vsi_bool interleaved;
+} vsi_nn_rope_param;
+_compiler_assert(offsetof(vsi_nn_rope_param, local) == 0, \
+    vsi_nn_rope_h );

-class NativeDeviceImpl : public NativeDevice {
- public:
-    NativeDeviceImpl(device_id_t id);
-    ~NativeDeviceImpl(){};
+#ifdef __cplusplus
+}
+#endif

-    bool Submit(const std::shared_ptr<tim::vx::Graph>& graph) override;
-    bool Trigger(bool async = false, async_callback cb = NULL) override;
-    bool DeviceExit() override;
-    void WaitDeviceIdle() override;
+#endif

- protected:
-    std::unique_ptr<vip::IDevice> vip_device_;
-    std::vector<vsi_nn_graph_t*> vsi_graph_v_;
-
-};
-
-} // namespace platform
-} // namespace vx
-} // namespace tim
-
-#endif /* TIM_VX_NATIVE_DEVICE_PRIVATE_H_*/
@@ -34,6 +34,7 @@ typedef struct _vsi_nn_topk_param
 {
     uint32_t k;
     int32_t axis;
+    struct _topk_local_data_t* local;
 } vsi_nn_topk_param;

 #ifdef __cplusplus
@@ -384,25 +384,17 @@ static VSI_INLINE_API float fp16_to_fp32

 static VSI_INLINE_API float bfp16_to_fp32
     (
-    int16_t in
+    uint16_t in
     )
 {
-    uint32_t t1, t2, t3;
     float out;
     fp32_bit_cast_t fp32_bit_cast;

-    t1 = in & 0x00FF; // Mantissa
-    t2 = in & 0xFF00; // Sign bit + Exponent
-    t3 = in & 0x7F00; // Exponent
+    fp32_bit_cast.data = (uint32_t)(in << 16);

-    t1 <<= 16;
-    t2 <<= 16; // Shift (sign + Exponent) bit into position
-    t1 |= t2; // Re-insert (sign + Exponent) bit
-
-    fp32_bit_cast.data = t1;
     out = fp32_bit_cast.val;

-    return t3 == 0 ? 0.0f : out;
+    return out;
 } /* bfp16_to_fp32() */

 static VSI_INLINE_API uint16_t fp32_to_fp16
@@ -720,7 +712,7 @@ static VSI_INLINE_API vsi_status dtype_to_float32
         *dst = fp16_to_fp32( *(int16_t *)src );
         break;
     case VSI_NN_TYPE_BFLOAT16:
-        *dst = bfp16_to_fp32( *(int16_t *)src );
+        *dst = bfp16_to_fp32( *(uint16_t *)src );
         break;
     case VSI_NN_TYPE_FLOAT8_E4M3:
         *dst = fp8_e4m3_to_fp32(*(int8_t*)src, src_dtype->scale);
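The two bfp16 hunks above land the same simplification: a bfloat16 value is exactly the top 16 bits of an IEEE-754 float32, so widening it and shifting left by 16 reproduces the float directly, with no need for the old mask-and-reassemble path (which also flushed small exponents to zero). A minimal standalone sketch of that conversion, independent of the ovxlib helpers:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* bfloat16 -> float32: place the 16 payload bits in the high half of a
 * 32-bit word and bit-cast. memcpy stands in for fp32_bit_cast_t here. */
static float bf16_to_f32(uint16_t in)
{
    uint32_t bits = (uint32_t)in << 16; /* widen first to avoid int overflow */
    float out;
    memcpy(&out, &bits, sizeof out);
    return out;
}

int main(void)
{
    printf("%f\n", bf16_to_f32(0x3F80)); /* 1.000000 */
    printf("%f\n", bf16_to_f32(0x4049)); /* ~3.140625, pi truncated to bf16 */
    return 0;
}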
[File diff suppressed because it is too large]
@@ -61,14 +61,13 @@ typedef struct _vsi_nn_hw_config_t
 {
     char target_name[VSI_NN_MAX_TARGET_NAME];
     vsi_nn_hw_evis_t evis;
 #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
     uint32_t subGroupSize;
 #endif
     uint32_t use_40bits_va;
     uint32_t support_stream_processor;
     uint32_t sp_exec_count;
     uint32_t sp_vector_depth;
     uint32_t sp_per_core_vector_depth;
     uint32_t support_ffd;
 } vsi_nn_hw_config_t;

 typedef struct _vsi_nn_runtime_option_t
@@ -89,6 +88,7 @@ typedef struct _vsi_nn_runtime_option_t
     int32_t enable_save_file_type;
     int32_t enable_use_image_process;
     int32_t enable_use_from_handle;
+    vsi_nn_hw_config_t config;
 } vsi_nn_runtime_option_t;

 /**
@@ -101,6 +101,15 @@ typedef struct _vsi_nn_context_t
     vsi_nn_runtime_option_t options;
 } VSI_PUBLIC_TYPE *vsi_nn_context_t;

+/**
+ * Query and set options->config hw params.
+ */
+OVXLIB_API vsi_status query_hardware_caps_runtime
+    (
+    vsi_nn_context_t ctx,
+    vsi_nn_runtime_option_t *options
+    );
+
 /**
 * Create context
 * Create ovxlib NN runtime context.
@@ -113,6 +122,11 @@ OVXLIB_API vsi_status vsi_nn_initOptions
     (
     vsi_nn_runtime_option_t *options
     );
+OVXLIB_API vsi_status vsi_nn_initOptions_runtime
+    (
+    vsi_nn_runtime_option_t *options,
+    vsi_nn_context_t ctx
+    );
 /**
 * Release context
 * Release ovxlib NN runtime resource and reset context handle to NULL.
@@ -57,5 +57,8 @@
 #define VSI_PER_GROUP_QUANTIZATION_SUPPORT
 #endif
 #define VSI_GRAPH_RUNTIME_ENV_SUPPORT
 #if defined(VX_TENSOR_SPARSITY_SUPPORT)
 #define VSI_TENSOR_SPARSITY_SUPPORT
 #endif

 #endif
@@ -216,6 +216,7 @@
 #include "ops/vsi_nn_op_grouped_conv3d.h"
 #include "ops/vsi_nn_op_col2im.h"
 #include "ops/vsi_nn_op_l1_layer_norm.h"
+#include "ops/vsi_nn_op_rope.h"
 /* custom node head define */
 #include "custom/vsi_nn_custom_node_type.h"
 #include "ops/vsi_nn_op_inverse_sigmoid.h"
@@ -420,6 +421,7 @@ typedef union _vsi_nn_nn_param
     vsi_nn_grouped_conv3d_param grouped_conv3d;
     vsi_nn_col2im_param col2im;
     vsi_nn_l1_layer_norm_param l1_layer_norm;
+    vsi_nn_rope_param rope;
     void* client_param;

     /* custom node data struct define */
@@ -86,8 +86,10 @@ typedef enum
     VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6,
     /** perchannel float8 */
     VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7,
-    /** GPQT */
+    /** pergroup symmetric */
     VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC = 0x8,
+    /** pergroup asymmetric */
+    VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC = 0x9,
     /** undefined type */
     VSI_NN_QNT_TYPE_NA = 0xff,
 } vsi_nn_qnt_type_e;
@@ -418,6 +418,34 @@ OVXLIB_API vsi_status vsi_nn_SetTensorIsScalar
     int8_t is_scalar
     );

+/**
+ * Get Tensor is_sparsity
+ * Get the is_sparsity of the tensor
+ *
+ * @param[in] tensor Tensor.
+ *
+ * @return is_sparsity flag of the tensor.
+ */
+OVXLIB_API int32_t vsi_nn_GetTensorIsSparsity
+    (
+    vsi_nn_tensor_t* tensor
+    );
+
+/**
+ * Set whether a weight tensor is sparse
+ * Set the is_sparsity for the tensor
+ *
+ * @param[in] tensor Tensor.
+ * @param[in] is_sparsity New is_sparsity value of the tensor.
+ *
+ * @return VSI_SUCCESS on success, or error code otherwise.
+ **/
+OVXLIB_API vsi_status vsi_nn_SetTensorIsSparsity(
+    vsi_nn_tensor_t* tensor,
+    int32_t is_sparsity
+    );
+
 OVXLIB_API vsi_status vsi_nn_CopyRawDataToTensor
     (
     vsi_nn_graph_t* graph,
@@ -33,7 +33,7 @@ extern "C"{

 #define VSI_NN_VERSION_MAJOR 1
 #define VSI_NN_VERSION_MINOR 2
-#define VSI_NN_VERSION_PATCH 14
+#define VSI_NN_VERSION_PATCH 22
 #define VSI_NN_VERSION \
     (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
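The patch bump keeps the packed VSI_NN_VERSION arithmetic consistent: major*10000 + minor*100 + patch encodes 1.2.22 as 10222, and the fields unpack with divide and modulo as long as minor and patch stay below 100. A small worked check (the macro names below only mimic ovxlib's; the program is just an illustration):

#include <stdio.h>

#define VERSION_MAJOR 1
#define VERSION_MINOR 2
#define VERSION_PATCH 22
#define VERSION (VERSION_MAJOR * 10000 + VERSION_MINOR * 100 + VERSION_PATCH)

int main(void)
{
    printf("packed: %d\n", VERSION);  /* 10222 */
    printf("major=%d minor=%d patch=%d\n",
           VERSION / 10000, (VERSION / 100) % 100, VERSION % 100);
    return 0;
}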
@@ -0,0 +1,475 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_dtype_util_prv.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */

#define _CUSTOM_LETTERBOX_KERNEL_SOURCE "custom_letterbox"

// Add kernel hashtable here
#define CUSTOM_LETTERBOX_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
    (( IN_DTYPE ) | ( OUT_DTYPE << 8 ))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
    { CUSTOM_LETTERBOX_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
      CVIVANTE_NAMESPACE("evis.custom_letterbox_"#IN_DTYPE"to"#OUT_DTYPE), \
      _CUSTOM_LETTERBOX_KERNEL_SOURCE }
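CUSTOM_LETTERBOX_HASH_KEY packs the input dtype into the low byte and the output dtype into the next byte, so each (in, out) pair maps to a distinct lookup key in the kernel map. A self-contained sketch of that packing and unpacking (the dtype codes below are illustrative placeholders, not the real vsi_nn_kernel_dtype_e values):

#include <stdint.h>
#include <stdio.h>

enum { DT_U8 = 2, DT_I8 = 3, DT_F16 = 7 };  /* hypothetical dtype codes */

#define LETTERBOX_KEY(in, out) ((uint32_t)(in) | ((uint32_t)(out) << 8))

int main(void)
{
    uint32_t key = LETTERBOX_KEY(DT_U8, DT_F16);
    /* low byte holds the input dtype, the next byte holds the output dtype */
    printf("key=0x%04x in=%u out=%u\n", key, key & 0xFF, (key >> 8) & 0xFF);
    return 0;
}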
typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _custom_letterbox_kernel_map[] =
{
    // Register kernel here
    PACK_KERNEL_MAP( U8, U8 ),
    PACK_KERNEL_MAP( U8, I8 ),
    PACK_KERNEL_MAP( U8, F16 ),
};

/*
 * Kernel params
 */
static vx_param_description_t _custom_letterbox_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};

#define _CUSTOM_LETTERBOX_PARAM_NUM _cnt_of_array( _custom_letterbox_kernel_param_def )
/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_custom_letterbox_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        2,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
    };

    vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
    VSI_UNREFERENCED(param_size);
    int32_t top = 0;
    int32_t bottom = 0;
    int32_t left = 0;
    int32_t right = 0;
    float scale_w = 0;
    float scale_h = 0;
    int32_t resize_w = 0;
    int32_t resize_h = 0;
    int32_t resize_max_w = 0;
    int32_t resize_max_h = 0;
    float output_scale = 1.0f;
    float output_zp = 0;
    float out_scale_r = 0;
    float out_zp_r = 0;
    float out_scale_g = 0;
    float out_zp_g = 0;
    float out_scale_b = 0;
    float out_zp_b = 0;
    float pad_v_r = 0;
    float pad_v_g = 0;
    float pad_v_b = 0;
    int32_t in_width = 0;
    int32_t in_height = 0;
    int32_t out_width = 0;
    int32_t out_height = 0;
    float mean_r = 0;
    float mean_g = 0;
    float mean_b = 0;
    float scale_r = 0;
    float scale_g = 0;
    float scale_b = 0;
    vx_int32 pad_value_r = 0;
    vx_int32 pad_value_g = 0;
    vx_int32 pad_value_b = 0;
    vx_int32 r_order = 0;
    vx_int32 b_order = 0;
    vx_int32 reverse_channel = 0;

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &top);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &bottom);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &left);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &right);
    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &mean_r);
    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &mean_g);
    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &mean_b);
    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &scale_r);
    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &scale_g);
    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[11], &scale_b);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &pad_value_r);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &pad_value_g);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[14], &pad_value_b);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[15], &reverse_channel);
    CHECK_STATUS_FAIL_GOTO(status, final );

    in_width = (int32_t)attr[0]->shape->data[0] / 3;
    in_height = (int32_t)attr[0]->shape->data[1];
    out_width = (int32_t)attr[1]->shape->data[0];
    out_height = (int32_t)attr[1]->shape->data[1] / 3;

    output_scale = 1.0f / attr[1]->scale;
    output_zp = (float)(attr[1]->zero_point);

    resize_w = out_width - left - right;
    resize_h = out_height - top - bottom;
    resize_max_w = out_width - right;
    resize_max_h = out_height - bottom;
    scale_w = (float)in_width / resize_w;
    scale_h = (float)in_height / resize_h;
    out_scale_r = scale_r / output_scale;
    out_zp_r = output_zp - out_scale_r * mean_r;
    out_scale_g = scale_g / output_scale;
    out_zp_g = output_zp - out_scale_g * mean_g;
    out_scale_b = scale_b / output_scale;
    out_zp_b = output_zp - out_scale_b * mean_b;
    pad_v_r = pad_value_r * out_scale_r + out_zp_r;
    pad_v_g = pad_value_g * out_scale_g + out_zp_g;
    pad_v_b = pad_value_b * out_scale_b + out_zp_b;

    if (reverse_channel)
    {
        r_order = out_height * 2;
        b_order = 0;
    }
    else
    {
        r_order = 0;
        b_order = out_height * 2;
    }

    {
        gpu_dp_inst_t uniU8RightSubLeft_4x4 = {{
            0x00090909, // TCfg
            0x00000000, // ASelt
            0x00140003, 0x00000025, // ABin
            0x000a0a0a, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000400, // AccumType, ConstantType, and PostShift
            0x00010001, 0x00000000, 0x00010001, 0x00000000,
            0x00010001, 0x00000000, 0x00000000, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniLeftToFloat32_4x4 = {{
            0x00010101, // TCfg
            0x00000000, // ASelt
            0x00010000, 0x00000002, // ABin
            0x00020202, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000400, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000000, 0x00000001, 0x00000000,
            0x00000001, 0x00000000, 0x00000000, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniExtactHalf8_2x8 = {{
            0x11111111, // TCfg
            0x11110000, // ASelt
            0x06040200, 0x06040200, // ABin
            0x22222222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000100, // AccumType, ConstantType, and PostShift
            0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
            0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniExtract8Data_2x8 = {{
            0x33333333, // TCfg
            0x11110000, // ASelt
            0x03020100, 0x03020100, // ABin
            0x00000000, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002400, // AccumType, ConstantType, and PostShift
            0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };
        status |= vsi_nn_kernel_gpu_add_param( node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4 );
        status |= vsi_nn_kernel_gpu_add_param( node, "uniLeftToFloat32_4x4", &uniLeftToFloat32_4x4 );
        status |= vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8 );
        status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Data_2x8", &uniExtract8Data_2x8 );
    }
    status |= vsi_nn_kernel_gpu_add_param( node, "top", &top );
    status |= vsi_nn_kernel_gpu_add_param( node, "left", &left );
    status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_r", &out_scale_r );
    status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_g", &out_scale_g );
    status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_b", &out_scale_b );
    status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_r", &out_zp_r );
    status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_g", &out_zp_g );
    status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_b", &out_zp_b );
    status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_r", &pad_v_r );
    status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_g", &pad_v_g );
    status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_b", &pad_v_b );
    status |= vsi_nn_kernel_gpu_add_param( node, "scale_w", &scale_w );
    status |= vsi_nn_kernel_gpu_add_param( node, "scale_h", &scale_h );
    status |= vsi_nn_kernel_gpu_add_param( node, "resize_max_w", &resize_max_w );
    status |= vsi_nn_kernel_gpu_add_param( node, "resize_max_h", &resize_max_h );
    status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height );
    status |= vsi_nn_kernel_gpu_add_param( node, "r_order", &r_order );
    status |= vsi_nn_kernel_gpu_add_param( node, "b_order", &b_order );

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_size[0] = out_width;
    gpu_param.global_size[1] = out_height;

    status |= vsi_nn_kernel_gpu_config( node, &gpu_param );
    CHECK_STATUS_FAIL_GOTO(status, final );

final:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }
    if (attr[1])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[1] );
        attr[1] = NULL;
    }

    return status;
} /* _custom_letterbox_initializer() */
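The block of out_scale_*/out_zp_*/pad_v_* assignments in the initializer folds per-channel normalization and output quantization into one multiply-add per pixel: normalizing a pixel p as (p - mean) * scale and then quantizing against the output tensor collapses to q = p * A + B, with B = zp - A * mean (matching out_zp_r = output_zp - out_scale_r * mean_r above), and the letterbox pad color gets the same transform applied once on the host. A small numeric sketch of that folding, assuming the common convention q = f / s_out + zp (all values illustrative):

#include <stdio.h>

int main(void)
{
    float mean = 114.0f, scale = 1.0f / 255.0f; /* hypothetical preprocessing */
    float s_out = 0.00392f, zp = 0.0f;          /* hypothetical output quant */
    float A = scale / s_out;                    /* folded multiplier */
    float B = zp - A * mean;                    /* folded offset */
    float p = 200.0f;                           /* an input pixel */
    float pad = 114.0f;                         /* letterbox pad value */
    printf("q(p)   = %f\n", p * A + B);
    printf("q(pad) = %f\n", pad * A + B);       /* like pad_v_* in the kernel */
    return 0;
}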
/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    const _kernel_map_type * kernel_map = _custom_letterbox_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _custom_letterbox_kernel_map );
    vx_param_description_t * param_def = _custom_letterbox_kernel_param_def;
    size_t param_def_size = _cnt_of_array( _custom_letterbox_kernel_param_def );
    vx_kernel_initialize_f initializer = _custom_letterbox_initializer;
    uint32_t key = 0;
    uint32_t i = 0;

    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    key = CUSTOM_LETTERBOX_HASH_KEY( in_dtype, out_dtype );

    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < (uint32_t)kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = (vx_uint32)param_def_size;
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
            "vsi_nn_kernel_header",
            kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
            kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }

    return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_CUSTOM_LETTERBOX_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;
    size_t i = 0;

    int32_t top = vsi_nn_kernel_param_get_int32( params, "top");
    int32_t bottom = vsi_nn_kernel_param_get_int32( params, "bottom");
    int32_t left = vsi_nn_kernel_param_get_int32( params, "left");
    int32_t right = vsi_nn_kernel_param_get_int32( params, "right");
    float mean_r = vsi_nn_kernel_param_get_float32( params, "mean_r");
    float mean_g = vsi_nn_kernel_param_get_float32( params, "mean_g");
    float mean_b = vsi_nn_kernel_param_get_float32( params, "mean_b");
    float scale_r = vsi_nn_kernel_param_get_float32( params, "scale_r");
    float scale_g = vsi_nn_kernel_param_get_float32( params, "scale_g");
    float scale_b = vsi_nn_kernel_param_get_float32( params, "scale_b");
    int32_t pad_value_r = vsi_nn_kernel_param_get_int32( params, "pad_value_r");
    int32_t pad_value_g = vsi_nn_kernel_param_get_int32( params, "pad_value_g");
    int32_t pad_value_b = vsi_nn_kernel_param_get_int32( params, "pad_value_b");
    int32_t reverse_channel = vsi_nn_kernel_param_get_int32( params, "reverse_channel");
    vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
    vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };

    uint32_t param_num = _CUSTOM_LETTERBOX_PARAM_NUM;
    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
    shapes[0][0] = inputs[0]->attr.size[1] * 3;
    shapes[0][1] = inputs[0]->attr.size[2];
    shapes[1][0] = outputs[0]->attr.size[0];
    shapes[1][1] = outputs[0]->attr.size[1] * 3;

    reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
        inputs[0], shapes[0], 2 );
    reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
        outputs[0], shapes[1], 2 );

    if (reshape_tensors[0] == NULL ||
        reshape_tensors[1] == NULL)
    {
        goto final;
    }

    if (reverse_channel)
    {
        float mean_temp = mean_r;
        float scale_temp = scale_r;
        int32_t pad_value_temp = pad_value_r;
        mean_r = mean_b;
        mean_b = mean_temp;
        scale_r = scale_b;
        scale_b = scale_temp;
        pad_value_r = pad_value_b;
        pad_value_b = pad_value_temp;
    }

    status = _query_kernel( kernel, inputs, outputs );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            uint32_t index = 2;

            vsi_nn_kernel_node_pack_io( node_params, param_num,
                reshape_tensors, 1, &reshape_tensors[1], 1 );

            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &bottom );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &right );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_r );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_g );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_b );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_r );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_g );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_b );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_r );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_g );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_b );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse_channel );

            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, param_num );
            vsi_nn_kernel_scalar_release( &node_params[2] );
            vsi_nn_kernel_scalar_release( &node_params[3] );
            vsi_nn_kernel_scalar_release( &node_params[4] );
            vsi_nn_kernel_scalar_release( &node_params[5] );
            vsi_nn_kernel_scalar_release( &node_params[6] );
            vsi_nn_kernel_scalar_release( &node_params[7] );
            vsi_nn_kernel_scalar_release( &node_params[8] );
            vsi_nn_kernel_scalar_release( &node_params[9] );
            vsi_nn_kernel_scalar_release( &node_params[10] );
            vsi_nn_kernel_scalar_release( &node_params[11] );
            vsi_nn_kernel_scalar_release( &node_params[12] );
            vsi_nn_kernel_scalar_release( &node_params[13] );
            vsi_nn_kernel_scalar_release( &node_params[14] );
            vsi_nn_kernel_scalar_release( &node_params[15] );

            CHECK_STATUS(status);
        }
    }

final:
    for (i = 0; i < 2; i++)
    {
        vsi_safe_release_tensor(reshape_tensors[i]);
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_EVIS( custom_letterbox, _setup )
@@ -35,6 +35,7 @@
 #include "utils/vsi_nn_dtype_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "libnnext/vsi_nn_vxkernel.h"
+#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

 #define _CPU_ARG_NUM (1)
 #define _CPU_INPUT_NUM (1)
@@ -42,6 +43,7 @@
 #define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
 #define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
 #define _KERNEL_NAME ("com.vivantecorp.extension.Softmax2VXC")
+#define _KERNEL_NAME_U8 ("com.vivantecorp.extension.Softmax2VXC_u8")

 #define SCALAR_INPUT_AXIS (2)
@@ -64,7 +66,11 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
 {
     vsi_status status = VSI_FAILURE;
     int sf_size = 0;
-    vsi_nn_kernel_tensor_attr_t* attr = NULL;
+    vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
+    float srcZP = 0.0f;
+    float srcScale = 1.0f;
+    float dstZP = 0.0f;
+    float dstScale = 1.0f;
     // Alignment with a power of two value.
     gpu_param_t gpu_param = {
         2, // workdim
@@ -75,14 +81,19 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)

     VSI_UNREFERENCED(param_size);

-    attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
-    if (!attr)
+    attr[0] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]);
+    attr[1] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]);
+    if ((!attr[0]) || (!attr[1]))
     {
         VSILOGE("Query failure! at line");
         return status;
     }

-    sf_size = (int)attr->shape->data[0];
+    sf_size = (int)attr[0]->shape->data[0];
+    srcScale = attr[0]->scale;
+    srcZP = (float)attr[0]->zero_point;
+    dstScale = 1.0f / attr[1]->scale;
+    dstZP = (float)attr[1]->zero_point;

     gpu_param.global_offset[0] = 0;
     gpu_param.global_offset[1] = 0;
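The initializer now also hands the shader the input dequantization pair (srcScale, srcZP) and the output requantization pair, with dstScale stored pre-inverted (1.0f / attr[1]->scale) so the kernel can multiply instead of divide. A host-side sketch of that round trip for a U8 softmax, under the usual affine convention f = (q - zp) * scale (all values illustrative; compile with -lm):

#include <math.h>
#include <stdio.h>

int main(void)
{
    unsigned char q_in[3] = { 130, 140, 150 };
    float src_scale = 0.1f, src_zp = 128.0f;
    float dst_scale = 1.0f / 0.00390625f, dst_zp = 0.0f; /* pre-inverted 1/scale_out */
    float f[3], sum = 0.0f;
    for (int i = 0; i < 3; i++) /* dequantize, exponentiate, accumulate */
    {
        f[i] = expf((q_in[i] - src_zp) * src_scale);
        sum += f[i];
    }
    for (int i = 0; i < 3; i++) /* normalize and requantize by multiplying */
        printf("q_out[%d] = %d\n", i, (int)(f[i] / sum * dst_scale + dst_zp + 0.5f));
    return 0;
}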
@@ -91,7 +102,7 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
     gpu_param.local_size[0] = 1;
     gpu_param.local_size[1] = 1;
     gpu_param.global_size[0] =
-        gpu_align_p2((1 + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0],
+        gpu_align_p2((attr[0]->shape->data[1] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0],
         gpu_param.local_size[0]);
     gpu_param.global_size[1] =
         gpu_align_p2((1 + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1],
@@ -107,25 +118,50 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
             0x00000001, 0x00000000, 0x00000001, 0x00000000,
             0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
         }, GPU_DP_TYPE_16};
+        gpu_dp_inst_t uniExtract8Bin_2x8 = {{
+            0x11111111, // TCfg
+            0x11110000, // ASelt
+            0x06040200, 0x06040200, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00002400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000001, 0x00000001, 0x00000001,
+            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+        }, GPU_DP_TYPE_16};

         status = vsi_nn_kernel_gpu_add_param( node,
             "Uni4x4_Fp16ToFp32", &Uni4x4_Fp16ToFp32 );
-        vsi_nn_kernel_gpu_add_param(node,
+        status |= vsi_nn_kernel_gpu_add_param( node,
+            "uniExtract8Bin_2x8", &uniExtract8Bin_2x8 );
+        status |= vsi_nn_kernel_gpu_add_param(node,
             "sf_size", &sf_size);
+        status |= vsi_nn_kernel_gpu_add_param(node, "srcScale", &srcScale);
+        status |= vsi_nn_kernel_gpu_add_param(node, "srcZP", &srcZP);
+        status |= vsi_nn_kernel_gpu_add_param(node, "dstScale", &dstScale);
+        status |= vsi_nn_kernel_gpu_add_param(node, "dstZP", &dstZP);
     }

-    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
+    status |= vsi_nn_kernel_gpu_config( node, &gpu_param );

     if(status != VSI_SUCCESS)
     {
         VSILOGE("Initializer failure!");
     }
-    if (attr) vsi_nn_kernel_tensor_attr_release( &attr );
+    if (attr[0])
+    {
+        vsi_nn_kernel_tensor_attr_release( &attr[0] );
+        attr[0] = NULL;
+    }
+    if (attr[1])
+    {
+        vsi_nn_kernel_tensor_attr_release( &attr[1] );
+        attr[1] = NULL;
+    }

     return status;
 }

-static const vx_kernel_description_t _kernel_info =
+static const vx_kernel_description_t _kernel_info1 =
 {
     KERNEL_ID_PLACEHOLDER,
     _KERNEL_NAME,
@@ -139,6 +175,20 @@ static const vx_kernel_description_t _kernel_info =
     vsi_nn_KernelDeinitializer
 };

+static const vx_kernel_description_t _kernel_info2 =
+{
+    KERNEL_ID_PLACEHOLDER,
+    _KERNEL_NAME_U8,
+    NULL,
+    kernel_param_def,
+    _cnt_of_array( kernel_param_def ),
+    vsi_nn_KernelValidator,
+    NULL,
+    NULL,
+    _softmax_initializer,
+    vsi_nn_KernelDeinitializer
+};
+
 static vsi_status _query_kernel
     (
     vsi_nn_tensor_t* const* const inputs,
@@ -146,9 +196,20 @@ static vsi_status _query_kernel
     vsi_nn_kernel_t* kernel
     )
 {
-    VSI_UNREFERENCED(inputs);
-    VSI_UNREFERENCED(outputs);
-    memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
+    vsi_nn_kernel_dtype_e in_dtype;
+    vsi_nn_kernel_dtype_e out_dtype;
+
+    in_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
+    out_dtype = vsi_nn_kernel_map_dtype(outputs[0]->attr.dtype.vx_type);
+
+    if (in_dtype == U8 && out_dtype == U8)
+    {
+        memmove( &kernel->info, &_kernel_info2, sizeof(vx_kernel_description_t) );
+    }
+    else
+    {
+        memmove( &kernel->info, &_kernel_info1, sizeof(vx_kernel_description_t) );
+    }

     vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
         "vsi_nn_kernel_header",
@@ -173,12 +234,42 @@ static vsi_nn_kernel_node_t _setup
     vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
     vsi_nn_kernel_node_t node = NULL;
     int32_t axis = 0;
+    vsi_nn_tensor_t* reshape_tensors[2] = {NULL};
+    vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}};
+    uint32_t rank_in = 0;
+    int32_t new_axis = 0;
+    uint32_t i = 0;
+    vsi_bool ret = vx_false_e;

     VSI_UNREFERENCED(input_num);
     VSI_UNREFERENCED(output_num);

     axis = vsi_nn_kernel_param_get_int32(params, "axis");

+    ret = vsi_nn_kernel_optimize_softmax_shape(inputs[0]->attr.size,
+        inputs[0]->attr.dim_num,
+        axis,
+        shapes[0],
+        &rank_in,
+        &new_axis);
+
+    if (ret)
+    {
+        reshape_tensors[0] = vsi_nn_reshape_tensor(graph, inputs[0], shapes[0], rank_in);
+        reshape_tensors[1] = vsi_nn_reshape_tensor(graph, outputs[0], shapes[0], rank_in);
+    }
+    else
+    {
+        return NULL;
+    }
+
+    if (!vsi_nn_kernel_gpu_check_shape(reshape_tensors[0]->attr.size,
+        reshape_tensors[0]->attr.dim_num) ||
+        new_axis > 2)
+    {
+        return NULL;
+    }
+
     status = _query_kernel( inputs, outputs, kernel );
     if( VSI_SUCCESS == status)
     {
@@ -187,9 +278,9 @@ static vsi_nn_kernel_node_t _setup
         {
             /* Set inputs and outputs */
             vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
-                inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
+                reshape_tensors, _CPU_INPUT_NUM, &reshape_tensors[1], _CPU_OUTPUT_NUM );
             backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
-                graph, I32, &axis );
+                graph, I32, &new_axis );

             /* Pass parameters to node. */
             status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
@@ -200,6 +291,11 @@ static vsi_nn_kernel_node_t _setup
             status = VSI_FAILURE;
         }
     }

+    for (i = 0; i < 2; i++)
+    {
+        vsi_safe_release_tensor(reshape_tensors[i]);
+    }
     return node;
 } /* _setup() */
@@ -0,0 +1,227 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <string.h>
#include <stdlib.h>

#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_constraint_check.h"

typedef struct _custom_letterbox_local_data_t {
    int32_t placeholder;
} custom_letterbox_local_data_t;

/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)

int32_t my_round(float in)
{
    if (in >= 0)
    {
        return (int)(in + 0.5f);
    }
    else
    {
        return (int)(in - 0.5f);
    }
}

static vsi_status op_compute
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
{
    vsi_nn_kernel_param_t * param = NULL;
    vsi_nn_custom_letterbox_param * p;
    p = &(self->nn_param.custom_letterbox);
    int32_t shape_w = (int32_t)inputs[0]->attr.size[1];
    int32_t shape_h = (int32_t)inputs[0]->attr.size[2];
    int32_t new_shape_w = (int32_t)outputs[0]->attr.size[0];
    int32_t new_shape_h = (int32_t)outputs[0]->attr.size[1];
    vx_bool auto_bool = p->auto_bool;
    vx_bool scaleFill = p->scaleFill;
    vx_bool scaleup = p->scaleup;
    int32_t stride = p->stride;
    vx_bool center = p->center;

    float r = 1.0f;
    int32_t new_unpad_w = 0;
    int32_t new_unpad_h = 0;
    int32_t dw = 0;
    int32_t dh = 0;
    int32_t top = 0;
    int32_t bottom = 0;
    int32_t left = 0;
    int32_t right = 0;

    r = (float)fmin((float)new_shape_w / shape_w, (float)new_shape_h / shape_h);
    if (!scaleup)
    {
        r = (float)fmin(r, 1.0f);
    }

    new_unpad_w = my_round(r * shape_w);
    new_unpad_h = my_round(r * shape_h);
    dw = new_shape_w - new_unpad_w;
    dh = new_shape_h - new_unpad_h;
    if (auto_bool)
    {
        dw = dw % stride;
        dh = dh % stride;
    }
    else if (scaleFill)
    {
        dw = 0;
        dh = 0;
        new_unpad_w = new_shape_w;
        new_unpad_h = new_shape_h;
    }
    if (center)
    {
        top = my_round(dh / 2.0f - 0.1f);
        bottom = my_round(dh / 2.0f + 0.1f);
        left = my_round(dw / 2.0f - 0.1f);
        right = my_round(dw / 2.0f + 0.1f);
    }
    else
    {
        top = 0;
        bottom = my_round(dh + 0.1f);
        left = 0;
        right = my_round(dw + 0.1f);
    }

    param = vsi_nn_kernel_param_create();
    vsi_nn_kernel_param_add_int32( param, "top", top);
    vsi_nn_kernel_param_add_int32( param, "bottom", bottom);
    vsi_nn_kernel_param_add_int32( param, "left", left);
    vsi_nn_kernel_param_add_int32( param, "right", right);
    vsi_nn_kernel_param_add_float32( param, "mean_r", p->mean_r);
    vsi_nn_kernel_param_add_float32( param, "mean_g", p->mean_g);
    vsi_nn_kernel_param_add_float32( param, "mean_b", p->mean_b);
    vsi_nn_kernel_param_add_float32( param, "scale_r", p->scale_r);
    vsi_nn_kernel_param_add_float32( param, "scale_g", p->scale_g);
    vsi_nn_kernel_param_add_float32( param, "scale_b", p->scale_b);
    vsi_nn_kernel_param_add_int32( param, "pad_value_r", p->pad_value_r);
    vsi_nn_kernel_param_add_int32( param, "pad_value_g", p->pad_value_g);
    vsi_nn_kernel_param_add_int32( param, "pad_value_b", p->pad_value_b);
    vsi_nn_kernel_param_add_int32( param, "reverse_channel", p->reverse_channel);

    self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
        "custom_letterbox",
        inputs, 1,
        outputs, 1, param );

    vsi_nn_kernel_param_release( &param );

    return VSI_SUCCESS;
} /* op_compute() */
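op_compute above mirrors the letterbox geometry familiar from YOLO-family preprocessing: pick the limiting scale ratio r, optionally refuse to upscale, compute the unpadded resize, and split the remaining padding (stride-aligned when auto_bool is set) between both sides; the +/-0.1f nudges make an odd padding split as floor/ceil. A worked example with concrete sizes (the numbers are illustrative, not from the source; compile with -lm):

#include <math.h>
#include <stdio.h>

static int my_round_ex(float in) /* same symmetric rounding as my_round() */
{
    return in >= 0 ? (int)(in + 0.5f) : (int)(in - 0.5f);
}

int main(void)
{
    int w = 1280, h = 720, new_w = 640, new_h = 640, stride = 32;
    float r = fminf((float)new_w / w, (float)new_h / h);  /* 0.5 */
    int unpad_w = my_round_ex(r * w);                     /* 640 */
    int unpad_h = my_round_ex(r * h);                     /* 360 */
    int dw = (new_w - unpad_w) % stride;                  /* auto: 0 */
    int dh = (new_h - unpad_h) % stride;                  /* auto: 280 % 32 = 24 */
    int top = my_round_ex(dh / 2.0f - 0.1f);              /* 12 */
    int bottom = my_round_ex(dh / 2.0f + 0.1f);           /* 12 */
    printf("resize to %dx%d, pad top=%d bottom=%d dw=%d\n",
           unpad_w, unpad_h, top, bottom, dw);
    return 0;
}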
static vsi_bool op_check
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
{
    BEGIN_IO_TYPE_DECL(LETTERBOX, 1, 1)
        IO_TYPE(D_U8, D_F16)
        IO_TYPE(D_U8, D_U8|Q_ASYM)
        IO_TYPE(D_U8, D_I8|Q_DFP)
        IO_TYPE(D_U8, D_I8|Q_ASYM)
        IO_TYPE(D_U8, D_I8|Q_SYM)
    END_IO_TYPE_DECL(LETTERBOX)
    if (!VALIDATE_OP_IO_TYPES(LETTERBOX, self, inputs, self->input.num, outputs, self->output.num)) {
        char* desc = generate_op_io_types_desc(inputs,
            self->input.num, outputs, self->output.num);
        VSILOGE("Inputs/Outputs data type not support: %s", desc);
        destroy_op_io_types_desc(desc);
        return FALSE;
    }

    return TRUE;
} /* op_check() */

static vsi_bool op_setup
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
{
    if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
    {
        outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
        outputs[0]->attr.size[0] = self->nn_param.custom_letterbox.new_shape_w;
        outputs[0]->attr.size[1] = self->nn_param.custom_letterbox.new_shape_h;
        outputs[0]->attr.size[2] = 3;
        outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
    }

    return TRUE;
} /* op_setup() */

static vsi_status op_deinit
    (
    vsi_nn_node_t* self
    )
{
    vsi_status status = VSI_SUCCESS;

    status = vsi_nn_op_common_deinit(self);

    return status;
} /* op_deinit() */

__BEGIN_DECLS

/* Registrar */
DEF_OP_REG
    (
    /* op_name */ CUSTOM_LETTERBOX,
    /* init */ NULL,
    /* compute */ op_compute,
    /* deinit */ op_deinit,
    /* check */ op_check,
    /* setup */ op_setup,
    /* optimize */ NULL,
    /* input_num */ _INPUT_NUM,
    /* output_num */ _OUTPUT_NUM
    );

__END_DECLS
@@ -85,18 +85,24 @@ static const struct {
     HASH_CUMSUM_KERNELS(0, U8, U8)
     HASH_CUMSUM_KERNELS(0, F32, F32)
     HASH_CUMSUM_KERNELS(0, F32, U8)
+    HASH_CUMSUM_KERNELS(0, I32, I32)
     HASH_CUMSUM_KERNELS(1, U8, U8)
     HASH_CUMSUM_KERNELS(1, F32, F32)
     HASH_CUMSUM_KERNELS(1, F32, U8)
+    HASH_CUMSUM_KERNELS(1, I32, I32)
     HASH_CUMSUM_KERNELS(2, U8, U8)
     HASH_CUMSUM_KERNELS(2, F32, F32)
     HASH_CUMSUM_KERNELS(2, F32, U8)
+    HASH_CUMSUM_KERNELS(2, I32, I32)

     HASH_CUMSUM_KERNELS_2D(0, U8, U8)
     HASH_CUMSUM_KERNELS_2D(0, F32, F32)
     HASH_CUMSUM_KERNELS_2D(0, F32, U8)
+    HASH_CUMSUM_KERNELS_2D(0, I32, I32)
     HASH_CUMSUM_KERNELS_2D(1, U8, U8)
     HASH_CUMSUM_KERNELS_2D(1, F32, F32)
     HASH_CUMSUM_KERNELS_2D(1, F32, U8)
+    HASH_CUMSUM_KERNELS_2D(1, I32, I32)

     HASH_CUMSUM_ARRAY_KERNELS(0, U8, U8, KERNEL_SOURCE_3)
     HASH_CUMSUM_ARRAY_KERNELS(0, F32, F32, KERNEL_SOURCE_3)
@@ -26,6 +26,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_tensor.h"
 #include "vsi_nn_graph.h"
 #include "vsi_nn_log.h"
@@ -644,7 +645,8 @@ static vsi_nn_kernel_node_t _setup

 #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
     shader_cnt_support =
-        (graph->ctx->config.subGroupSize >= 64 && graph->ctx->config.use_40bits_va) ? TRUE : FALSE;
+        (((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize >= 64 &&
+        ((vsi_nn_graph_prv_t*)graph)->options->config.use_40bits_va) ? TRUE : FALSE;
 #endif
     if ((in1_h % 64 == 0) && (transFlg == 1) && (out_h % 8 == 0) && shader_cnt_support)
     {
@@ -75,6 +75,7 @@ static const _kernel_map_type _one_hot_kernel_map[] =
     PACK_ONE_HOT_KERNEL_MAP( F32, F32 ),
     PACK_ONE_HOT_KERNEL_MAP( I32, I32 ),
     PACK_ONE_HOT_KERNEL_MAP( I32, F32 ),
     PACK_ONE_HOT_KERNEL_MAP( I32, BF16 ),
     PACK_ONE_HOT_KERNEL_MAP( I32, U8 ),
     PACK_ONE_HOT_KERNEL_MAP( U8, U8 ),
 };
@@ -79,7 +79,7 @@ static const struct {
     const char* source_name;
 } kernel_map[] =
 {
-    PRELU_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1)
+    PRELU_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1)
     PRELU_KERNELS_FLOAT(F16, F16, F16, KERNEL_SOURCE_1)
     PRELU_KERNELS(U8, U8, U8, KERNEL_SOURCE_1)
     PRELU_KERNELS(I32, I32, I32, KERNEL_SOURCE_1)
@@ -0,0 +1,329 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
typedef enum
{
    INTERNAL_KERNEL_ROPE,
} _internal_kernel_e;

#define _ROPE_KERNEL_SOURCE "rope"
#define _ROPE_KERNEL_NAME CVIVANTE_NAMESPACE("cl.rope")

// Add kernel hashtable here
#define STR(a) #a
#define ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ) \
    ((IN0_DTYPE) | (IN1_DTYPE << 8) | (OUT_DTYPE << 16) | (AXIS << 25))
#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ) \
    { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ), \
      CVIVANTE_NAMESPACE("cl.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_axis"STR(AXIS)), \
      "rope_0" }
typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _rope_kernel_map[] =
{
    // Register kernel here
    PACK_KERNEL_MAP( F32, F32, F32, 0 ),
    PACK_KERNEL_MAP( F32, F32, F32, 1 ),
    PACK_KERNEL_MAP( F32, F32, F32, 2 ),
    PACK_KERNEL_MAP( I32, I32, I32, 0 ),
    PACK_KERNEL_MAP( I32, I32, I32, 1 ),
    PACK_KERNEL_MAP( I32, I32, I32, 2 ),
    PACK_KERNEL_MAP( U32, U32, U32, 0 ),
    PACK_KERNEL_MAP( U32, U32, U32, 1 ),
    PACK_KERNEL_MAP( U32, U32, U32, 2 ),
};


/*
 * Kernel params
 */
static vx_param_description_t _rope_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROPE_PARAM_NUM _cnt_of_array( _rope_kernel_param_def )
#define SCALAR_AXIS (4)
#define SCALAR_IN_ZP (5)
#define SCALAR_COS_ZP (6)
#define SCALAR_SIN_ZP (7)
#define SCALAR_SCALE0 (8)
#define SCALAR_SCALE1 (9)
#define SCALAR_OUT_ZP (10)
#define SCALAR_HALF_HEAD_SIZE (11)
#define SCALAR_STEP (12)
/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_rope_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    gpu_param_t gpu_param = {
        3, // workdim
        {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
        {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
        {0, 0, 0}, // localWorkSize: local group size in thread
        {0, 0, 0}  // globalWorkSize: image size in thread
    };
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_tensor_attr_t* attr[2] = { NULL };
    int32_t axis = 0;
    vsi_size_array_t* out_shape = NULL;
    vsi_size_t shape[3] = { 1 };

    VSI_UNREFERENCED(node);
    VSI_UNREFERENCED(param_size);

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis);
    CHECK_STATUS_FAIL_GOTO(status, final);

    out_shape = attr[1]->shape;
    shape[0] = out_shape->data[0];
    shape[1] = out_shape->data[1];
    shape[2] = out_shape->data[2];
    shape[axis] = shape[axis] / 2;

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;
    gpu_param.global_size[0] = shape[0];
    gpu_param.global_size[1] = shape[1];
    gpu_param.global_size[2] = out_shape->size > 2 ? shape[2] : 1;

    status = vsi_nn_kernel_gpu_config(node, &gpu_param);

final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
    SAFE_FREE_TENSOR_ATTR(attr[0]);
    SAFE_FREE_TENSOR_ATTR(attr[1]);
#undef SAFE_FREE_TENSOR_ATTR

    return status;
} /* _rope_initializer() */
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs,
|
||||
int32_t axis
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_dtype_e in0_dtype;
|
||||
vsi_nn_kernel_dtype_e in1_dtype;
|
||||
vsi_nn_kernel_dtype_e in2_dtype;
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
const _kernel_map_type * kernel_map = _rope_kernel_map;
|
||||
size_t kernel_map_size = _cnt_of_array( _rope_kernel_map );
|
||||
vx_param_description_t * param_def = _rope_kernel_param_def;
|
||||
vx_kernel_initialize_f initializer = _rope_initializer;
|
||||
|
||||
uint32_t key = 0;
|
||||
uint32_t i;
|
||||
|
||||
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type);
|
||||
in2_dtype = vsi_nn_kernel_map_dtype(inputs[2]->attr.dtype.vx_type);
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
#define _PACK_SELECT_KEY( in0_type, in1_type, in2_type, out_type ) \
|
||||
((in0_type) | (in1_type << 8) | (in2_type << 16) | (out_type << 24))
|
||||
switch (_PACK_SELECT_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype))
|
||||
{
|
||||
case _PACK_SELECT_KEY(F32, F32, F32, F32):
|
||||
case _PACK_SELECT_KEY(F16, F16, F16, F16):
|
||||
key = ROPE_HASH_KEY(F32, F32, F32, axis);
|
||||
break;
|
||||
case _PACK_SELECT_KEY(U8, U8, U8, U8):
|
||||
case _PACK_SELECT_KEY(U16, U16, U16, U16):
|
||||
key = ROPE_HASH_KEY(U32, U32, U32, axis);
|
||||
break;
|
||||
case _PACK_SELECT_KEY(I8, I8, I8, I8):
|
||||
case _PACK_SELECT_KEY(I16, I16, I16, I16):
|
||||
case _PACK_SELECT_KEY(I32, I32, I32, I32):
|
||||
key = ROPE_HASH_KEY(I32, I32, I32, axis);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#undef _PACK_SELECT_KEY
|
||||
|
||||
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
|
||||
{
|
||||
if ( kernel_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( i < (uint32_t)kernel_map_size )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
|
||||
kernel->info.parameters = param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _rope_kernel_param_def );
|
||||
kernel->info.initialize = initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"eltwise_ops_helper",
|
||||
kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
return status;
|
||||
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ROPE_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
int32_t interleaved = vsi_nn_kernel_param_get_int32(params, "interleaved");
float in_scale = vsi_nn_get_tensor_scale(inputs[0]);
float cos_scale = vsi_nn_get_tensor_scale(inputs[1]);
float sin_scale = vsi_nn_get_tensor_scale(inputs[2]);
float out_scale = vsi_nn_get_tensor_scale(outputs[0]);
float in_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float cos_zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
float sin_zp = (float)vsi_nn_get_tensor_zero_point(inputs[2]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
int32_t half_head_size = interleaved ? 1 : (int32_t)(inputs[0]->attr.size[axis] / 2);
float scale0 = in_scale * cos_scale / out_scale;
float scale1 = in_scale * sin_scale / out_scale;
int32_t step = interleaved ? 2 : 1;
int32_t i = 0;

// Check if gpu can support the size
if ( !vsi_nn_kernel_gpu_check_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num ) )
{
return NULL;
}

status = _query_kernel( kernel, inputs, outputs, axis );
if (VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ROPE_PARAM_NUM,
inputs, input_num, outputs, output_num );
/* Pass parameters to node. */
node_params[SCALAR_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis);
node_params[SCALAR_IN_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &in_zp);
node_params[SCALAR_COS_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &cos_zp);
node_params[SCALAR_SIN_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &sin_zp);
node_params[SCALAR_SCALE0] = vsi_nn_kernel_scalar_create(
graph, F32, &scale0);
node_params[SCALAR_SCALE1] = vsi_nn_kernel_scalar_create(
graph, F32, &scale1);
node_params[SCALAR_OUT_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &output_zp);
node_params[SCALAR_HALF_HEAD_SIZE] = vsi_nn_kernel_scalar_create(
graph, I32, &half_head_size);
node_params[SCALAR_STEP] = vsi_nn_kernel_scalar_create(
graph, I32, &step);
status = vsi_nn_kernel_node_pass_param( node, node_params, _ROPE_PARAM_NUM );
}
}

for (i = SCALAR_AXIS; i < (int32_t)_ROPE_PARAM_NUM; i++)
{
if (node_params[i])
{
vsi_nn_kernel_scalar_release(&node_params[i]);
}
}
return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( rope, _setup )
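A note on the scale folding in `_setup()` above (my reading of the code, not text from the patch): with affine quantization r = (q - zp) * s, the product of the dequantized input and the dequantized cos table carries a factor in_scale * cos_scale, and requantizing the result divides by out_scale, which is exactly scale0 = in_scale * cos_scale / out_scale (likewise scale1 for the sin term). A toy sketch of one multiply-requantize step under that assumption:

```c
#include <stdint.h>
#include <stdio.h>

/* One affine-quantized multiply, folded the same way as scale0 above:
 * (q_in - in_zp)*in_scale * (q_cos - cos_zp)*cos_scale / out_scale + out_zp */
static float requant_mul(int32_t q_in, float in_zp, float in_scale,
                         int32_t q_cos, float cos_zp, float cos_scale,
                         float out_scale, float out_zp)
{
    float scale0 = in_scale * cos_scale / out_scale;  /* precomputed once, as in _setup() */
    return ((float)q_in - in_zp) * ((float)q_cos - cos_zp) * scale0 + out_zp;
}

int main(void)
{
    /* example: u8 input 130 (zp 128, scale 0.02) times cos 200 (zp 128, scale 0.01) */
    printf("%f\n", requant_mul(130, 128.f, 0.02f, 200, 128.f, 0.01f, 0.001f, 128.f));
    return 0;
}
```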
@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"

@ -299,7 +300,7 @@ static vsi_nn_kernel_node_t _setup
VSI_UNREFERENCED(output_num);

#if (VX_ACTIVATION_EXT_SUPPORT)
if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
{
return NULL;
}
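The same one-line change recurs throughout the hunks below: reads of `graph->ctx->config` become reads of `((vsi_nn_graph_prv_t*)graph)->options->config`, moving hardware settings from the shared context to per-graph options. The cast is the usual C "private struct extends public struct" pattern; here is a self-contained sketch with hypothetical layouts (the real `vsi_nn_graph_prv_t` lives in `vsi_nn_types_prv.h`, hence the new include in every file touched):

```c
#include <stdio.h>

typedef struct { int ver; } evis_config_t;                 /* hypothetical */
typedef struct { evis_config_t evis; } hw_config_t;        /* hypothetical */
typedef struct { hw_config_t config; } graph_options_t;    /* hypothetical */

typedef struct { int dummy; } vsi_nn_graph_t;              /* stand-in for the public type */

typedef struct
{
    vsi_nn_graph_t pub;          /* public struct MUST be the first member... */
    graph_options_t* options;    /* ...so graph* and prv* share the same address */
} vsi_nn_graph_prv_t;

static int evis_version(const vsi_nn_graph_t* graph)
{
    /* safe only under the first-member invariant above */
    return ((const vsi_nn_graph_prv_t*)graph)->options->config.evis.ver;
}

int main(void)
{
    graph_options_t opts = { { { 2 } } };
    vsi_nn_graph_prv_t g = { { 0 }, &opts };
    printf("%d\n", evis_version(&g.pub));   /* prints 2 */
    return 0;
}
```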
@ -26,6 +26,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"

@ -457,7 +458,7 @@ static vsi_nn_kernel_node_t _setup
vsi_bool is_odd_even_sort = FALSE;
vsi_bool is_bitnoic_segment = FALSE;
size_t param_num = _TOPK_PARAM_NUM;
int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2);
int32_t max_stages = 7 + (int32_t)log2(((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize >> 2);
vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

@ -483,6 +484,11 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}

if (block_size >= GPU_TENSOR_MAX_WIDTH)
{
return NULL;
}

shape[0][0] = block_size;
shape[0][1] = block_num;
shape[1][0] = top_k;
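A worked example for the `max_stages` expression above (my interpretation: it bounds how many sorting stages the topk shader can run, scaled by the device subgroup width, so larger subgroups admit larger on-chip blocks): with `subGroupSize = 32`, `max_stages = 7 + log2(32 >> 2) = 7 + 3 = 10`.

```c
#include <math.h>
#include <stdio.h>

/* same formula as the patch; subGroupSize is assumed to be a power of two >= 4 */
static int max_stages_for(int sub_group_size)
{
    return 7 + (int)log2((double)(sub_group_size >> 2));
}

int main(void)
{
    printf("%d %d\n", max_stages_for(16), max_stages_for(32));   /* prints 9 10 */
    return 0;
}
```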
@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"

@ -192,7 +193,7 @@ static vsi_bool _bucketize_support_types
return FALSE;
}

if (in_dtype == F16 && graph->ctx->config.evis.ver != VSI_NN_HW_EVIS_2)
if (in_dtype == F16 && ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver != VSI_NN_HW_EVIS_2)
{
return FALSE;
}
@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"

@ -771,7 +772,8 @@ static vsi_nn_kernel_node_t _setup
temp_tensor[1] = weights;
temp_tensor[2] = biases;

ks = get_kernel_size(weights->attr.size[0], dilation, stride, graph->ctx->config.evis.ver);
ks = get_kernel_size(weights->attr.size[0], dilation, stride,
((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver);

status = _query_kernel( kernel, temp_tensor, outputs, dilation, ks);
@ -121,7 +121,9 @@ static const _kernel_map_type _groupnorm_sums_kernel_map[] =
TENSOR_GROUPNORM_SUMS_KERNELS( U8, F32, KERNEL_SOURCE_0 )
TENSOR_GROUPNORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_0 )
TENSOR_GROUPNORM_SUMS_KERNELS( I16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS( U16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS_2D( I16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS_2D( U16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS( F16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 )
};

@ -174,6 +176,9 @@ static const _kernel_map_type _groupnorm_kernel_map[] =
TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, U8, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, F16, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, F16, KERNEL_SOURCE_2 )

TENSOR_GROUPNORM_SCALE_KERNELS( U16, F32, U16, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SCALE_KERNELS_2D( U16, F32, U16, KERNEL_SOURCE_2 )
};

/*

@ -245,6 +250,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
float sum_x2_tail0 = 1;
float sum_x2_tail1 = 1;
float work_item_pixels = 1;
vsi_bool is_input_8bits = FALSE;

VSI_UNREFERENCED(param_size);

@ -263,12 +269,13 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
width = (int32_t)(input_shape->data[0]);
height = (int32_t)(input_shape->data[1]);
chn = (int32_t)(attr[1]->shape->data[1]);
is_input_8bits = attr[0]->dtype == I8 || attr[0]->dtype == U8;
if (is2D)
{
height = 1;
}

work_item_pixels = (float)height * 16;
work_item_pixels = is_input_8bits ? 16 * (float)height : 8 * (float)height;

sum_x_tail = -work_item_pixels * input_zp * input_scale;
sum_x2_tail0 = work_item_pixels * input_zp * input_zp * input_scale2;

@ -281,11 +288,11 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
shaderParam.local_size[1] = 1;
shaderParam.local_size[2] = 1;

if (attr[0]->dtype == I8 || attr[0]->dtype == U8)
if (is_input_8bits)
{
shaderParam.global_size[0] = (width + 255) / 256 * 16;
}
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
{
shaderParam.global_size[0] = (width + 127) / 128 * 16;
}
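The dispatch arithmetic above in one place (a hedged reading of the code, not part of the patch): each thread processes a fixed-width pixel strip, 16 threads cover a 256-pixel tile for 8-bit inputs and a 128-pixel tile for 16-bit ones, so the per-row global size is ceil(width / tile) * 16, matching the `work_item_pixels` split of 16 versus 8 pixels per work item.

```c
#include <stdio.h>

static unsigned global_size_x(unsigned width, int is_8bit)
{
    unsigned tile = is_8bit ? 256u : 128u;     /* pixels covered by one 16-thread tile */
    return (width + tile - 1u) / tile * 16u;   /* == (width + 255) / 256 * 16 for 8-bit */
}

int main(void)
{
    printf("%u %u\n", global_size_x(300, 1), global_size_x(300, 0));   /* prints 32 48 */
    return 0;
}
```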
@ -324,7 +331,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
{
gpu_dp_inst_t uniSum_X_X2_8x2 = {{
0x55555555, // TCfg

@ -483,7 +490,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
}

shaderParam.global_scale[0] = 16;
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
{
shaderParam.global_scale[0] = 8;
}

@ -610,6 +617,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U16, U16 ):
case _PACK_SELECT_KEY( I16, I16 ):
case _PACK_SELECT_KEY( I16, F16 ):
case _PACK_SELECT_KEY( F16, F16 ):

@ -838,8 +846,7 @@ static vsi_nn_kernel_node_t _setup
attr.is_const = FALSE;
attr.vtl = TRUE;
attr.size[0] = ((new_shape[0] + 255) / 256) * 4;
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
|| inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16)
if (in0_dtype == I16 || in0_dtype == F16 || in0_dtype == U16)
{
attr.size[0] = ((new_shape[0] + 127) / 128) * 4;
}
@ -124,22 +124,23 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
{0, 0, 0}
};
int8_t in0_fl = 0;
int32_t inputZP0 = 0;
float input_scale0 = 1.0f;
int32_t inputZP1 = 0;
float input_scale1 = 1.0f;
int32_t input0_zp = 0;
float input0_scale = 1.0f;
int32_t input1_zp = 0;
float input1_scale = 1.0f;
float output_zp = 0;
int8_t out_fl = 0;
float outputZP = 0;

int32_t shift0 = 0;
vsi_bool is_ge_fl = FALSE;
int32_t shift0 = 0;
vsi_bool is_ge_fl = FALSE;

vsi_bool is_2d_img = FALSE;
uint32_t evis_version = 0;

vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
vsi_size_array_t * out_shape = NULL;
uint32_t pack_key;
vx_context ctx = vxGetContext((vx_reference)node);
vx_context ctx = vxGetContext((vx_reference)node);
vx_hardware_caps_params_t hw_param;

VSI_UNREFERENCED(param_size);
@ -165,34 +166,30 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );

out_shape = attr[2]->shape;
inputZP0 = attr[0]->zero_point;
input_scale0 = attr[0]->scale;
inputZP1 = attr[1]->zero_point;
input_scale1 = attr[1]->scale;
outputZP = (float)attr[2]->zero_point;
input_scale0 = input_scale0 / attr[2]->scale;
input0_zp = attr[0]->zero_point;
input0_scale = attr[0]->scale;
input1_zp = attr[1]->zero_point;
input1_scale = attr[1]->scale;
output_zp = (float)attr[2]->zero_point;
input0_scale = input0_scale / attr[2]->scale;

if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP &&
attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
in0_fl = (int8_t)attr[0]->dfp.fl;
}

if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
out_fl = (int8_t)attr[2]->dfp.fl;
shift0 = in0_fl - out_fl;
is_ge_fl = shift0 >= 0;
}

shift0 = in0_fl - out_fl;

is_2d_img = (out_shape->size < 3) || (out_shape->data[2] == 1);
is_ge_fl = shift0 >= 0;

#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, GE_FL, IMG_2D, EVIS2 ) \
(IN0_TYPE | ( OUT_TYPE << 16) | (GE_FL << 24) | (IMG_2D << 25) | (EVIS2 << 26))

pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype, is_ge_fl, is_2d_img, evis_version );
pack_key = _PACK_SELECT_KEY(attr[0]->dtype, attr[2]->dtype, is_ge_fl, is_2d_img, evis_version);

if ( attr[0]->dtype == I8 && attr[2]->dtype == I8 && is_ge_fl)
if (attr[0]->dtype == I8 && attr[2]->dtype == I8 && is_ge_fl)
{
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;

@ -204,7 +201,6 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}

gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
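Background for the `shift0 = in0_fl - out_fl` logic above (a sketch, not part of the patch): a DFP (dynamic fixed point) tensor with fractional length fl stores r = q * 2^-fl, so moving a value from the input's fl to the output's fl is a plain arithmetic shift, and `is_ge_fl` records whether that shift goes right, which is the direction these kernels can fold into the dot-product post-shift.

```c
#include <stdint.h>
#include <stdio.h>

/* requantize q from fractional length in_fl to out_fl: q_out = q * 2^(out_fl - in_fl) */
static int32_t dfp_requant(int32_t q, int8_t in_fl, int8_t out_fl)
{
    int32_t shift0 = in_fl - out_fl;             /* same quantity as in the initializer */
    return (shift0 >= 0) ? (q >> shift0) : (q << -shift0);
}

int main(void)
{
    /* 1.5 at fl=4 is q=24; at fl=2 it becomes q=6, still 1.5 */
    printf("%d\n", dfp_requant(24, 4, 2));
    return 0;
}
```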
@ -215,97 +211,97 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)

switch( pack_key )
{
case _PACK_SELECT_KEY( I8, I8, 1, 1, 2 ):
case _PACK_SELECT_KEY( I16, I16, 1, 1, 2 ):
case _PACK_SELECT_KEY(I8, I8, 1, 1, 2):
case _PACK_SELECT_KEY(I16, I16, 1, 1, 2):
{
gpu_dp_inst_t uniPreluDFPLo_2x8b = { {
0x77777777, // TCfg
0x44444444, // ASelt
0x33221100, 0x77665544, // ABin
0x00000000, // BSelt
0x30201000, 0x70605040, // BBin
0x00004000, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluDFPHi_2x8b = { {
0x77777777, // TCfg
0x44444444, // ASelt
0xbbaa9988, 0xffeeddcc, // ABin
0x00000000, // BSelt
0x30201000, 0x70605040, // BBin
0x00004000, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

if (attr[0]->dtype == I16)
{
gpu_dp_inst_t uniPreluDFPLo_2x8b = {{
0x77777777, // TCfg
0x44444444, // ASelt
0x33221100, 0x77665544, // ABin
0x00000000, // BSelt
0x30201000, 0x70605040, // BBin
0x00004000, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluDFPHi_2x8b = {{
0x77777777, // TCfg
0x44444444, // ASelt
0xbbaa9988, 0xffeeddcc, // ABin
0x00000000, // BSelt
0x30201000, 0x70605040, // BBin
0x00004000, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

if ( attr[0]->dtype == I16 )
{
uniPreluDFPLo_2x8b.data[7] = 0x00003000;
uniPreluDFPHi_2x8b.data[7] = 0x00003000;
}

gpu_dp_inst_update_postshfit( &uniPreluDFPLo_2x8b, shift0 );
gpu_dp_inst_update_postshfit( &uniPreluDFPHi_2x8b, shift0 );

status = vsi_nn_kernel_gpu_add_param( node,
"uniPreluDFPLo_2x8b", &uniPreluDFPLo_2x8b );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniPreluDFPHi_2x8b", &uniPreluDFPHi_2x8b );
CHECK_STATUS_FAIL_GOTO(status, final );
uniPreluDFPLo_2x8b.data[7] = 0x00003000;
uniPreluDFPHi_2x8b.data[7] = 0x00003000;
}
break;
case _PACK_SELECT_KEY( I8, I8, 1, 1, 1 ):
case _PACK_SELECT_KEY( I16, I16, 1, 1, 1 ):
{
gpu_dp_inst_t uniPreluInt8_2x8 = {{
0x55555555, // TCfg
0x00000000, // ASelt
0xb3a29180, 0xf7e6d5c4, // ABin
0x66666666, // BSelt
0x30201000, 0x70605040, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluInt16_part0_4x4 = {{
0x05050505, // TCfg
0x00000000, // ASelt
0x00510040, 0x00730062, // ABin
0x06060606, // BSelt
0x00100000, 0x00300020, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluInt16_part1_4x4 = {{
0x05050505, // TCfg
0x00000000, // ASelt
0x00510040, 0x00730062, // ABin
0x06060606, // BSelt
0x00500040, 0x00700060, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

gpu_dp_inst_update_postshfit( &uniPreluInt8_2x8, shift0 );
gpu_dp_inst_update_postshfit( &uniPreluInt16_part0_4x4, shift0 );
gpu_dp_inst_update_postshfit( &uniPreluInt16_part1_4x4, shift0 );
gpu_dp_inst_update_postshfit(&uniPreluDFPLo_2x8b, shift0);
gpu_dp_inst_update_postshfit(&uniPreluDFPHi_2x8b, shift0);

status = vsi_nn_kernel_gpu_add_param( node,
"uniPreluInt8_2x8", &uniPreluInt8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniPreluInt16_part0_4x4", &uniPreluInt16_part0_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniPreluInt16_part1_4x4", &uniPreluInt16_part1_4x4 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 1 ):
case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 2 ):
case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 1 ):
case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 2 ):
status = vsi_nn_kernel_gpu_add_param(node,
"uniPreluDFPLo_2x8b", &uniPreluDFPLo_2x8b);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniPreluDFPHi_2x8b", &uniPreluDFPHi_2x8b);
CHECK_STATUS_FAIL_GOTO(status, final);
}
break;
case _PACK_SELECT_KEY(I8, I8, 1, 1, 1):
case _PACK_SELECT_KEY(I16, I16, 1, 1, 1):
{
gpu_dp_inst_t uniPreluInt8_2x8 = { {
0x55555555, // TCfg
0x00000000, // ASelt
0xb3a29180, 0xf7e6d5c4, // ABin
0x66666666, // BSelt
0x30201000, 0x70605040, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluInt16_part0_4x4 = { {
0x05050505, // TCfg
0x00000000, // ASelt
0x00510040, 0x00730062, // ABin
0x06060606, // BSelt
0x00100000, 0x00300020, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluInt16_part1_4x4 = { {
0x05050505, // TCfg
0x00000000, // ASelt
0x00510040, 0x00730062, // ABin
0x06060606, // BSelt
0x00500040, 0x00700060, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

gpu_dp_inst_update_postshfit(&uniPreluInt8_2x8, shift0);
gpu_dp_inst_update_postshfit(&uniPreluInt16_part0_4x4, shift0);
gpu_dp_inst_update_postshfit(&uniPreluInt16_part1_4x4, shift0);

status = vsi_nn_kernel_gpu_add_param(node,
"uniPreluInt8_2x8", &uniPreluInt8_2x8);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniPreluInt16_part0_4x4", &uniPreluInt16_part0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniPreluInt16_part1_4x4", &uniPreluInt16_part1_4x4);
CHECK_STATUS_FAIL_GOTO(status, final);
}
break;
case _PACK_SELECT_KEY(BF16, BF16, 0, 1, 1):
case _PACK_SELECT_KEY(BF16, BF16, 0, 1, 2):
case _PACK_SELECT_KEY(BF16, BF16, 0, 0, 1):
case _PACK_SELECT_KEY(BF16, BF16, 0, 0, 2):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
@ -446,15 +442,15 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvF16toF32_part1_4x4", &uniConvF16toF32_part1_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"inputZP0", &inputZP0 );
"input0_zp", &input0_zp);
status |= vsi_nn_kernel_gpu_add_param( node,
"input_scale0", &input_scale0 );
"input0_scale", &input0_scale );
status |= vsi_nn_kernel_gpu_add_param( node,
"inputZP1", &inputZP1 );
"input1_zp", &input1_zp);
status |= vsi_nn_kernel_gpu_add_param( node,
"input_scale1", &input_scale1 );
"input1_scale", &input1_scale );
status |= vsi_nn_kernel_gpu_add_param( node,
"outputZP", &outputZP );
"output_zp", &output_zp );
if (attr[2]->dtype == F16)
{
status |= vsi_nn_kernel_gpu_add_param( node,
@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -58,53 +59,92 @@ typedef enum
#define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_1"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_2"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_3"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC4(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_4"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC5(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_5"

#define STR(a) #a
// Add kernel hashtable here
#define RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, scale_flag ) \
(( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (scale_flag))
#define RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, scale_flag, same_type ) \
(( IN_DTYPE ) | ( OUT_DTYPE << 8) | (scale_flag << 16) | (same_type << 22))

#define PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, DOWN ), \
#define _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, DOWN, SAME_TYPE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_DOWN"), \
_RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) }

#define PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP ), \
#define PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE ) \
_PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, TRUE ), \
_PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, FALSE )

#define _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP, SAME_TYPE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP"), \
_RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) }

#define PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_OPT ), \
#define PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE ) \
_PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, TRUE ), \
_PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, FALSE )

#define _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_OPT, SAME_TYPE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP_opt"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_OPT(IN_DTYPE) }

#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF ), \
#define PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE ) \
_PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, TRUE ), \
_PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, FALSE )

#define PACK_KERNEL_MAP_UP_2X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_2x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }

#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF ), \
#define PACK_KERNEL_MAP_UP_4X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_4x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }

#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF ), \
#define PACK_KERNEL_MAP_UP_8X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_8x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(IN_DTYPE) }

#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF ), \
#define PACK_KERNEL_MAP_UP_3X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_3x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }

#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF, FALSE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_2x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(IN_DTYPE) }

#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF, FALSE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_4x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(IN_DTYPE) }

#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF, FALSE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_8x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC5(IN_DTYPE) }

#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF, FALSE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_3x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC4(IN_DTYPE) }

#define PACK_KERNEL_MAP_UP_8X_ALIGN( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN ), \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_8x_upsample_align_corners"), \
"resize_bilinear_align_corners" }
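The reworked `RESIZE_BILINEAR_HASH_KEY` above packs four fields into disjoint bit ranges: input dtype in bits 0..7, output dtype in 8..15, scale_flag in 16..21, and the new same_type flag in bit 22. A hedged compile-time sanity check one could keep next to the macro (C11 `static_assert`; the 6-bit bound on scale_flag is my inference from the shift amounts, not stated in the patch):

```c
#include <assert.h>
#include <stdint.h>

#define HASH_KEY(in, out, flag, same) \
    ((uint32_t)(in) | ((uint32_t)(out) << 8) | ((uint32_t)(flag) << 16) | ((uint32_t)(same) << 22))

/* fields stay disjoint only if dtypes fit in 8 bits and scale_flag in 6 bits */
static_assert(HASH_KEY(0xFF, 0xFF, 0x3F, 1) == 0x7FFFFFu, "key fields overlap");
static_assert(HASH_KEY(1, 0, 0, 0) != HASH_KEY(0, 0, 0, 1), "fields must not alias");

int main(void) { return 0; }
```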
@ -135,6 +175,10 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] =
PACK_KERNEL_MAP_UP(F16, F16),
PACK_KERNEL_MAP_UP(BF16, BF16),
PACK_KERNEL_MAP_UP_OPT(U8, U8),
PACK_KERNEL_MAP_UP_2X_HALF_SAME_TYPE(U8, U8),
PACK_KERNEL_MAP_UP_3X_HALF_SAME_TYPE(U8, U8),
PACK_KERNEL_MAP_UP_4X_HALF_SAME_TYPE(U8, U8),
PACK_KERNEL_MAP_UP_8X_HALF_SAME_TYPE(U8, U8),
PACK_KERNEL_MAP_UP_2X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_3X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_4X_HALF(U8, U8),
@ -672,18 +716,23 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * out_shape = NULL;
vsi_size_array_t * in_shape = NULL;
vsi_size_array_t * out_shape = NULL;
vsi_size_array_t * in_shape = NULL;
vsi_nn_kernel_dtype_e input_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = F16;
uint32_t depth = 0;
uint32_t in_width = 0;
uint32_t in_height = 0;
uint32_t out_width = 0;
uint32_t out_height = 0;
vsi_bool is_same_type = FALSE;
vsi_bool is_2x_up_kernel = FALSE;
vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE;
vsi_bool is_8x_up_kernel = FALSE;
float scale = 1.f;
int32_t input_zp = 0;
int32_t output_zp = 0;

VSI_UNREFERENCED(param_size);
@ -692,17 +741,23 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );

out_shape = output_attr->shape;
in_shape = input_attr->shape;
input_dtype = input_attr->dtype;
out_shape = output_attr->shape;
in_shape = input_attr->shape;
input_dtype = input_attr->dtype;
output_dtype = output_attr->dtype;

in_width = (uint32_t)(in_shape->data[0]);
in_height = (uint32_t)(in_shape->data[1]);
depth = (uint32_t)(in_shape->data[2]);
out_width = (uint32_t)(out_shape->data[0]);
out_height = (uint32_t)(out_shape->data[1]);
scale = input_attr->scale;
input_zp = input_attr->zero_point;
scale /= output_attr->scale;
output_zp = output_attr->zero_point;
is_same_type = _is_same_quant(input_attr, output_attr);

if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)))
if ((U8 == input_dtype) && (output_dtype == U8))
{
is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
@ -728,206 +783,303 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
gpu_param.global_scale[2] = 1;
}

if (is_2x_up_kernel)
if (is_2x_up_kernel || is_3x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
{
gpu_dp_inst_t uniResize2xUp_0_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize2xUp_1_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16};
uint16_t M0 = 0;
int32_t postShift = 0;
uint32_t multAndoutZP[2] = { 0 };
gpu_dp_inst_t uniU8PostProcess_2x8 = { {
0xdddddddd, // TCfg
0x44444444, // ASelt
0x13121110, 0x17161514, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_3x_up_kernel)
{
gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{
0x15515515, // TCfg
0x00000000, // ASelt
0x21210110, 0x03323202, // ABin
0x2aa2aa2a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555,
0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{
0x05155155, // TCfg
0x00000000, // ASelt
0x54044343, 0x00650554, // ABin
0x0a2aa2aa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa,
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{
0x55551155, // TCfg
0x50501050, // ASelt
0x01011010, 0x21212121, // ABin
0xaaaa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{
0x11555511, // TCfg
0x10505010, // ASelt
0x32320202, 0x03033232, // ABin
0x22aaaa22, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72,
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{
0x55115555, // TCfg
0x50105050, // ASelt
0x43434343, 0x54540404, // ABin
0xaa22aaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39,
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{
0x00551155, // TCfg
0x00501050, // ASelt
0x05055454, 0x00006565, // ABin
0x00aa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
if (is_2x_up_kernel)
{
gpu_dp_inst_t uniResize2xUp_0_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize2xUp_1_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16 };

status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_4x_up_kernel)
{
gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16};
if (!is_same_type)
{
float f2i_radio = 16.0f;
gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);

status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_8x_up_kernel)
{
gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
uniResize2xUp_0_4x8.data[7] = 0x00000700;
uniResize2xUp_1_4x8.data[7] = 0x00000700;

status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
&uniU8PostProcess_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
CHECK_STATUS_FAIL_GOTO(status, final);
}

status = vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final);
}
else if (is_3x_up_kernel)
{
gpu_dp_inst_t uniResize3xUp_l00_2x8 = { {
0x15515515, // TCfg
0x00000000, // ASelt
0x21210110, 0x03323202, // ABin
0x2aa2aa2a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555,
0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize3xUp_l01_2x8 = { {
0x05155155, // TCfg
0x00000000, // ASelt
0x54044343, 0x00650554, // ABin
0x0a2aa2aa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa,
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize3xUp_l10_4x4 = { {
0x55551155, // TCfg
0x50501050, // ASelt
0x01011010, 0x21212121, // ABin
0xaaaa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize3xUp_l11_4x4 = { {
0x11555511, // TCfg
0x10505010, // ASelt
0x32320202, 0x03033232, // ABin
0x22aaaa22, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72,
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize3xUp_l12_4x4 = { {
0x55115555, // TCfg
0x50105050, // ASelt
0x43434343, 0x54540404, // ABin
0xaa22aaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39,
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize3xUp_l13_4x4 = { {
0x00551155, // TCfg
0x00501050, // ASelt
0x05055454, 0x00006565, // ABin
0x00aa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

if (!is_same_type)
{
float f2i_radio = 256.0f;
gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);

gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
uniResize3xUp_l00_2x8.data[7] = 0x00000608;
uniResize3xUp_l01_2x8.data[7] = 0x00000608;
uniResize3xUp_l10_4x4.data[7] = 0x00000607;
uniResize3xUp_l11_4x4.data[7] = 0x00000607;
uniResize3xUp_l12_4x4.data[7] = 0x00000607;
uniResize3xUp_l13_4x4.data[7] = 0x00000607;

status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
&uniU8PostProcess_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
CHECK_STATUS_FAIL_GOTO(status, final);
}

status = vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4);
CHECK_STATUS_FAIL_GOTO(status, final);
}
else if (is_4x_up_kernel)
{
gpu_dp_inst_t uniResize4xUp_l00_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize4xUp_l01_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize4xUp_l10_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize4xUp_l11_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16 };

if (!is_same_type)
{
float f2i_radio = 64.0f;
gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);

gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
uniResize4xUp_l00_4x8.data[7] = 0x00000400;
uniResize4xUp_l01_4x8.data[7] = 0x00000400;
uniResize4xUp_l10_4x8.data[7] = 0x00000400;
uniResize4xUp_l11_4x8.data[7] = 0x00000400;

status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
&uniU8PostProcess_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
CHECK_STATUS_FAIL_GOTO(status, final);
}

status = vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final);
}
else if (is_8x_up_kernel)
{
gpu_dp_inst_t uniResize8xUp_l00_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l01_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l10_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l11_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l20_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l21_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l30_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l31_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16 };

if (!is_same_type)
{
float f2i_radio = 256.0f;
gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);

gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
uniResize8xUp_l00_4x8.data[7] = 0x00000700;
uniResize8xUp_l01_4x8.data[7] = 0x00000700;
uniResize8xUp_l10_4x8.data[7] = 0x00000700;
uniResize8xUp_l11_4x8.data[7] = 0x00000700;
uniResize8xUp_l20_4x8.data[7] = 0x00000700;
uniResize8xUp_l21_4x8.data[7] = 0x00000700;
uniResize8xUp_l30_4x8.data[7] = 0x00000700;
uniResize8xUp_l31_4x8.data[7] = 0x00000700;

status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
&uniU8PostProcess_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
CHECK_STATUS_FAIL_GOTO(status, final);
}

status = vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final);
}
}
else
{
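The `(M0, postShift)` pair produced in the `!is_same_type` branches above turns a float rescale into fixed-point GPU work: `gpu_quantize_multiplier_16bit` factors `scale / f2i_radio` as roughly `M0 * 2^-postShift`, and `multAndoutZP` folds both zero points into one multiply-add. A minimal reimplementation sketch under that assumption (the library's actual rounding may differ):

```c
#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* factor scale ~= M0 * 2^-post_shift with a 16-bit mantissa */
static void quantize_multiplier_16bit(double scale, uint16_t* M0, int32_t* post_shift)
{
    int exp = 0;
    double frac = frexp(scale, &exp);        /* scale = frac * 2^exp, frac in [0.5, 1) */
    *M0 = (uint16_t)lround(frac * 32768.0);  /* keep 15 fractional bits */
    *post_shift = 15 - exp;
}

int main(void)
{
    uint16_t M0; int32_t shift;
    quantize_multiplier_16bit(0.37, &M0, &shift);
    printf("M0=%u shift=%d approx=%f\n", M0, shift, M0 / pow(2.0, shift));
    return 0;
}
```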
@@ -1193,22 +1345,22 @@ static vsi_status _query_kernel
    if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0])
    {
        if (is_same_type && (!align_corners) && (half_pixel_centers) && is_2x_upsample)
        if ((!align_corners) && (half_pixel_centers) && is_2x_upsample)
        {
            scale_flag = UP_2X_HALF;
            initializer = _bilinear_half_pixel_centers_opt_initializer;
        }
        else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_3x_upsample)
        else if ((!align_corners) && (half_pixel_centers) && is_3x_upsample)
        {
            scale_flag = UP_3X_HALF;
            initializer = _bilinear_half_pixel_centers_opt_initializer;
        }
        else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_4x_upsample)
        else if ((!align_corners) && (half_pixel_centers) && is_4x_upsample)
        {
            scale_flag = UP_4X_HALF;
            initializer = _bilinear_half_pixel_centers_opt_initializer;
        }
        else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_8x_upsample)
        else if ((!align_corners) && (half_pixel_centers) && is_8x_upsample)
        {
            scale_flag = UP_8X_HALF;
            initializer = _bilinear_half_pixel_centers_opt_initializer;
@@ -1232,7 +1384,7 @@ static vsi_status _query_kernel
        scale_flag = DOWN;
    }

    key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag );
    key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
    for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if( kernel_map[i].key == key )
@@ -1244,7 +1396,7 @@ static vsi_status _query_kernel
    if ((UP_OPT == scale_flag) && (i >= kernel_map_size))
    {
        scale_flag = UP;
        key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag );
        key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
        for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
        {
            if( kernel_map[i].key == key )
@@ -1257,7 +1409,7 @@ static vsi_status _query_kernel
    if ((UP == scale_flag) && (i >= kernel_map_size))
    {
        scale_flag = DOWN;
        key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag );
        key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
        for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
        {
            if( kernel_map[i].key == key )
@@ -1433,7 +1585,7 @@ static vsi_bool _is_image_width_lt16
    size_t bytes = vsi_nn_kernel_dtype_get_bytes(in_dtype);
    vsi_size_t max_cross_read_img_width = bytes == 1 ? 16 : 8;

    if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
    if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
    {
        return FALSE;
    }
@@ -1468,7 +1620,8 @@ static vsi_nn_kernel_node_t _setup
    int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
    int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
    vsi_bool is_same_type = vsi_nn_is_same_type(inputs[0], outputs[0]);
    vsi_bool is_evis2 = (vsi_bool)(graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_2);
    vsi_bool is_evis2 = \
        (vsi_bool)(((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver == VSI_NN_HW_EVIS_2);
    vsi_bool is_run_opt_kernel = FALSE;
    vsi_nn_tensor_t* scale = NULL;
    int32_t pad_left = half_pixel_centers ? 1 : 0;

@@ -0,0 +1,744 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 * B --- batch
 * N --- num_heads
 * S --- sequence length
 * H --- head size
 */
typedef enum
{
    LAYOUT_NONE,
    LAYOUT_BNHS,
    LAYOUT_BNH1,
    LAYOUT_BSNH,
    LAYOUT_BNSH,
} _internal_rope_layout_e;

// Add kernel hashtable here
#define STR(a) #a
#define ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT, INTERLEAVED ) \
        ((IN0_DTYPE) | (IN1_DTYPE << 8) | (OUT_DTYPE << 16) | (LAYOUT << 24) | (INTERLEAVED << 28))
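/* Key layout: the in0 dtype occupies bits [7:0], the in1 dtype [15:8], the
 * output dtype [23:16], the layout enum [27:24], and the interleaved flag
 * [31:28]. */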
#define PACK_KERNEL_BNHS_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
        { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNHS, 0 ), \
        CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnhs"), \
        "rope_0" }
#define PACK_KERNEL_BNH1_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
        { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNH1, 0 ), \
        CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnh1"), \
        "rope_1" }

#define PACK_KERNEL_BSNH_INTERLEAVED_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
        { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BSNH, 1 ), \
        CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bsnh"), \
        "rope_2" }

#define PACK_KERNEL_BNSH_INTERLEAVED_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
        { ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNSH, 1 ), \
        CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnsh"), \
        "rope_3" }

typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

#define PACK_KERNEL_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
        PACK_KERNEL_BNHS_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
        PACK_KERNEL_BNH1_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
        PACK_KERNEL_BSNH_INTERLEAVED_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
        PACK_KERNEL_BNSH_INTERLEAVED_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE),

static const _kernel_map_type _rope_kernel_map[] =
{
    // Register kernel here
    PACK_KERNEL_MAP( BF16, BF16, BF16 )
    PACK_KERNEL_MAP( F16, F16, F16 )
    PACK_KERNEL_MAP( I16, I16, I16 )
    PACK_KERNEL_MAP( I16, F16, I16 )
    PACK_KERNEL_MAP( I16, I16, I8 )
    PACK_KERNEL_MAP( I16, F16, I8 )
    PACK_KERNEL_MAP( I16, I16, U8 )
    PACK_KERNEL_MAP( I16, F16, U8 )
    PACK_KERNEL_MAP( U16, U16, U16 )
    PACK_KERNEL_MAP( U16, F16, U16 )
    PACK_KERNEL_MAP( I8, I8, I8 )
    PACK_KERNEL_MAP( I8, F16, I8 )
    PACK_KERNEL_MAP( U8, U8, U8 )
    PACK_KERNEL_MAP( U8, F16, U8 )
};

/*
 * Kernel params
 */
static vx_param_description_t _rope_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROPE_PARAM_NUM _cnt_of_array( _rope_kernel_param_def )
#define SCALAR_AXIS (4)
/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_rope_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };
    vsi_nn_kernel_tensor_attr_t* out_attr = NULL;
    vsi_nn_kernel_tensor_attr_t* in0_attr = NULL;
    vsi_nn_kernel_tensor_attr_t* in1_attr = NULL;
    vsi_nn_kernel_tensor_attr_t* in2_attr = NULL;
    vsi_size_array_t* in_shape = NULL;
    vsi_nn_kernel_dtype_e in0_dtype = F16;
    vsi_nn_kernel_dtype_e in1_dtype = F16;
    vsi_nn_kernel_dtype_e in2_dtype = F16;
    vsi_nn_kernel_dtype_e out_dtype = F16;
    float in0_scale = 1.0f;
    float in1_scale = 1.0f;
    float in2_scale = 1.0f;
    float output_scale = 1.0f;
    float output_zp = 0;
    int32_t in0_zp = 0;
    int32_t cos_zp = 0;
    int32_t sin_zp = 0;
    int32_t p = 0;
    int32_t axis = 0;
    int32_t interleaved = 0;
    int32_t half_head_size = 1;
    vsi_size_t shape[3] = {1};
    uint32_t pack_key = 0;

    VSI_UNREFERENCED(node);
    VSI_UNREFERENCED(param);
    VSI_UNREFERENCED(param_size);
    // Add initializer

    in0_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]);
    CHECK_PTR_FAIL_GOTO(in0_attr, "Create tensor attr buffer fail.", final);
    in1_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]);
    CHECK_PTR_FAIL_GOTO(in1_attr, "Create tensor attr buffer fail.", final);
    in2_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]);
    CHECK_PTR_FAIL_GOTO(in2_attr, "Create tensor attr buffer fail.", final);
    out_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[3]);
    CHECK_PTR_FAIL_GOTO(out_attr, "Create tensor attr buffer fail.", final);

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &p);
    CHECK_STATUS_FAIL_GOTO(status, final);

    axis = p & 0xFFFF;
    interleaved = (p >> 16) & 0xFFFF;
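    /* param[4] carries axis and the interleaved flag packed into a single
     * I32 scalar by _setup(): param = (interleaved << 16) | axis. */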

    in_shape = in0_attr->shape;
    in0_dtype = in0_attr->dtype;
    in1_dtype = in1_attr->dtype;
    in2_dtype = in2_attr->dtype;
    out_dtype = out_attr->dtype;

    in0_scale = in0_attr->scale;
    in1_scale = in1_attr->scale;
    in2_scale = in2_attr->scale;
    in0_zp = -in0_attr->zero_point;
    cos_zp = -in1_attr->zero_point;
    sin_zp = -in2_attr->zero_point;
    output_scale = out_attr->scale;
    output_zp = (float)out_attr->zero_point;

    half_head_size = (int32_t)(in_shape->data[axis] / 2);
    shape[0] = in_shape->data[0];
    shape[1] = in_shape->data[1];
    shape[2] = in_shape->data[2];
    shape[axis] = half_head_size;

    gpu_param.global_scale[0] = 8;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;
    gpu_param.global_size[0] = gpu_align_p2((shape[0] + \
        gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
    gpu_param.global_size[1] = shape[1];
    gpu_param.global_size[2] = shape[2];

#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \
        ((IN0_TYPE) | (IN1_TYPE << 8) | (IN2_TYPE << 16) | (OUT_TYPE << 24))

    pack_key = _PACK_SELECT_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype);
    switch (pack_key)
    {
    case _PACK_SELECT_KEY(BF16, BF16, BF16, BF16):
        {
            gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = { {
                0x11111111, // TCfg
                0x01010101, // ASelt
                0x01050004, 0x03070206, // ABin
                0x22222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000600, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000001, 0x00000001, 0x00000001,
                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = { {
                0x11111111, // TCfg
                0x01010101, // ASelt
                0x05050404, 0x07070606, // ABin
                0x22222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000600, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000001, 0x00000001, 0x00000001,
                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniExtractOddData_2x8 = { {
                0x11111111, // TCfg
                0x11110000, // ASelt
                0x07050301, 0x07050301, // ABin
                0x22222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000600, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000001, 0x00000001, 0x00000001,
                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
            }, GPU_DP_TYPE_16 };

            if (interleaved && axis == 0)
            {
                uniExtractOddData_2x8.data[1] = 0x10101010;
                uniExtractOddData_2x8.data[2] = 0x03030101;
                uniExtractOddData_2x8.data[3] = 0x07070505;
            }
            else
            {
                status = vsi_nn_kernel_gpu_add_param(node,
                        "half_head_size", &half_head_size);
                CHECK_STATUS_FAIL_GOTO(status, final);
            }
            status = vsi_nn_kernel_gpu_add_param(node,
                    "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "uniExtractOddData_2x8", &uniExtractOddData_2x8);
            CHECK_STATUS_FAIL_GOTO(status, final);
        }
        break;
    case _PACK_SELECT_KEY(I16, I16, I16, I16):
    case _PACK_SELECT_KEY(I16, F16, F16, I16):
    case _PACK_SELECT_KEY(I16, I16, I16, I8):
    case _PACK_SELECT_KEY(I16, F16, F16, I8):
    case _PACK_SELECT_KEY(I16, I16, I16, U8):
    case _PACK_SELECT_KEY(I16, F16, F16, U8):
    case _PACK_SELECT_KEY(F16, F16, F16, F16):
        {
            float scale0 = in0_scale * in1_scale / output_scale;
            float scale1 = in0_scale * in2_scale / output_scale;
            gpu_dp_inst_t uniExtractHalf8_2x8 = { {
                0x11111111, // TCfg
                0x11110000, // ASelt
                0x06040200, 0x06040200, // ABin
                0x22222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000100, // AccumType, ConstantType, and PostShift
                0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
                0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniExtractInteger_2x8 = { {
                0x33333333, // TCfg
                0x11110000, // ASelt
                0x03020100, 0x03020100, // ABin
                0x00000000, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00002400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniATimesB_0_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00010000, 0x00030002, // ABin
                0x01010101, // BSelt
                0x00010000, 0x00030002, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniATimesB_1_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00050004, 0x00070006, // ABin
                0x01010101, // BSelt
                0x00050004, 0x00070006, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAEvenTimesB_0_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00020000, 0x00060004, // ABin
                0x01010101, // BSelt
                0x00010000, 0x00030002, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAEvenTimesB_1_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00020000, 0x00060004, // ABin
                0x01010101, // BSelt
                0x00050004, 0x00070006, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAOddTimesB_0_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00030001, 0x00070005, // ABin
                0x01010101, // BSelt
                0x00010000, 0x00030002, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAOddTimesB_1_4x4 = { {
                0x01010101, // TCfg
                0x00000000, // ASelt
                0x00030001, 0x00070005, // ABin
                0x01010101, // BSelt
                0x00050004, 0x00070006, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };

            if (interleaved && axis == 0)
            {
                uniExtractHalf8_2x8.data[1] = 0x10101010;
                uniExtractHalf8_2x8.data[2] = 0x02020000;
                uniExtractHalf8_2x8.data[3] = 0x06060404;
                uniExtractInteger_2x8.data[1] = 0x10101010;
                uniExtractInteger_2x8.data[2] = 0x01010000;
                uniExtractInteger_2x8.data[3] = 0x03030202;

                status = vsi_nn_kernel_gpu_add_param(node,
                        "uniAEvenTimesB_0_4x4", &uniAEvenTimesB_0_4x4);
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniAEvenTimesB_1_4x4", &uniAEvenTimesB_1_4x4);
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniAOddTimesB_0_4x4", &uniAOddTimesB_0_4x4);
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniAOddTimesB_1_4x4", &uniAOddTimesB_1_4x4);
            }
            else
            {
                status = vsi_nn_kernel_gpu_add_param(node,
                        "uniATimesB_0_4x4", &uniATimesB_0_4x4);
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniATimesB_1_4x4", &uniATimesB_1_4x4);
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "half_head_size", &half_head_size);
            }
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "scale0", &scale0);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "scale1", &scale1);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "output_zp", &output_zp);
            if (out_dtype == F16)
            {
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniExtract8Data_2x8", &uniExtractHalf8_2x8);
            }
            else
            {
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniExtract8Data_2x8", &uniExtractInteger_2x8);
            }
            CHECK_STATUS_FAIL_GOTO(status, final);
        }
        break;
    case _PACK_SELECT_KEY(I8, I8, I8, I8):
    case _PACK_SELECT_KEY(U8, U8, U8, U8):
    case _PACK_SELECT_KEY(U16, U16, U16, U16):
    case _PACK_SELECT_KEY(I8, F16, F16, I8):
    case _PACK_SELECT_KEY(U8, F16, F16, U8):
    case _PACK_SELECT_KEY(U16, F16, F16, U16):
        {
            float scale0 = in0_scale * in1_scale / output_scale;
            float scale1 = in0_scale * in2_scale / output_scale;
            gpu_dp_inst_t uniExtractInteger_2x8 = { {
                0x33333333, // TCfg
                0x11110000, // ASelt
                0x03020100, 0x03020100, // ABin
                0x00000000, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00002400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAMinusZp_0_4x4 = { {
                0x0d0d0d0d, // TCfg
                0x04040404, // ASelt
                0x00010000, 0x00030002, // ABin
                0x02020202, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00002400, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000000, 0x00000001, 0x00000000,
                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAMinusZp_1_4x4 = { {
                0x0d0d0d0d, // TCfg
                0x04040404, // ASelt
                0x00050004, 0x00070006, // ABin
                0x02020202, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00002400, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000000, 0x00000001, 0x00000000,
                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAEvenMinusZp_4x4 = { {
                0x0d0d0d0d, // TCfg
                0x04040404, // ASelt
                0x00020000, 0x00060004, // ABin
                0x02020202, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00002400, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000000, 0x00000001, 0x00000000,
                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniAOddMinusZp_4x4 = { {
                0x0d0d0d0d, // TCfg
                0x04040404, // ASelt
                0x00030001, 0x00070005, // ABin
                0x02020202, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00002400, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000000, 0x00000001, 0x00000000,
                0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };

            if (interleaved && axis == 0)
            {
                uniExtractInteger_2x8.data[1] = 0x10101010;
                uniExtractInteger_2x8.data[2] = 0x01010000;
                uniExtractInteger_2x8.data[3] = 0x03030202;

                status = vsi_nn_kernel_gpu_add_param(node,
                        "uniAEvenMinusZp_4x4", &uniAEvenMinusZp_4x4);
                status |= vsi_nn_kernel_gpu_add_param(node,
                        "uniAOddMinusZp_4x4", &uniAOddMinusZp_4x4);
            }
            else
            {
                status = vsi_nn_kernel_gpu_add_param(node,
                        "half_head_size", &half_head_size);
            }
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "uniAMinusZp_0_4x4", &uniAMinusZp_0_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "uniAMinusZp_1_4x4", &uniAMinusZp_1_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "scale0", &scale0);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "scale1", &scale1);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "output_zp", &output_zp);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "in0_zp", &in0_zp);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "cos_zp", &cos_zp);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "sin_zp", &sin_zp);
            status |= vsi_nn_kernel_gpu_add_param(node,
                    "uniExtract8Data_2x8", &uniExtractInteger_2x8);
            CHECK_STATUS_FAIL_GOTO(status, final);
        }
        break;
    default:
        break;
    }
    status = vsi_nn_kernel_gpu_config(node, &gpu_param);
final:
    if (in0_attr) vsi_nn_kernel_tensor_attr_release(&in0_attr);
    if (in1_attr) vsi_nn_kernel_tensor_attr_release(&in1_attr);
    if (in2_attr) vsi_nn_kernel_tensor_attr_release(&in2_attr);
    if (out_attr) vsi_nn_kernel_tensor_attr_release(&out_attr);
    return status;
} /* _rope_initializer() */
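
/* Editorial sketch: the rotation these kernels evaluate, written out as
 * scalar C for reference only (it is not part of the shipped kernel). This
 * assumes the non-interleaved pairing, where element x[i] rotates against
 * x[i + half_head_size]; the interleaved path (interleaved && axis == 0
 * above) pairs adjacent even/odd elements instead. */
static void _rope_reference_sketch
    (
    const float * x,      /* one head of input, head_size elements */
    const float * cos_t,  /* cos table, half_head_size elements */
    const float * sin_t,  /* sin table, half_head_size elements */
    float * y,            /* rotated output, head_size elements */
    int32_t head_size
    )
{
    int32_t i = 0;
    int32_t half = head_size / 2;
    for (i = 0; i < half; i++)
    {
        y[i]        = x[i] * cos_t[i] - x[i + half] * sin_t[i];
        y[i + half] = x[i + half] * cos_t[i] + x[i] * sin_t[i];
    }
}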

/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
    int32_t axis,
    int32_t interleaved,
    _internal_rope_layout_e *layout
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in0_dtype;
    vsi_nn_kernel_dtype_e in1_dtype;
    vsi_nn_kernel_dtype_e in2_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    int32_t in0_zp = vsi_nn_get_tensor_zero_point(inputs[0]);
    int32_t in1_zp = vsi_nn_get_tensor_zero_point(inputs[1]);
    int32_t in2_zp = vsi_nn_get_tensor_zero_point(inputs[2]);
    const _kernel_map_type * kernel_map = _rope_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _rope_kernel_map );
    vx_param_description_t * param_def = _rope_kernel_param_def;
    vx_kernel_initialize_f initializer = _rope_initializer;

    uint32_t key;
    uint32_t i;

    in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
    in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    /* only symmetric quantization is supported for int16 */
    if ( ( (in0_dtype == I16 && in1_dtype == I16 && out_dtype == I16) ||
           (in0_dtype == I16 && in1_dtype == F16 && out_dtype == I16) ||
           (in0_dtype == I16 && in1_dtype == F16 && out_dtype == I8) ||
           (in0_dtype == I16 && in1_dtype == I16 && out_dtype == I8) ||
           (in0_dtype == I16 && in1_dtype == F16 && out_dtype == U8) ||
           (in0_dtype == I16 && in1_dtype == I16 && out_dtype == U8) ) &&
         (in0_zp != 0 || in1_zp != 0 || in2_zp != 0))
    {
        return VSI_FAILURE;
    }

    if (axis == 1 && inputs[0]->attr.size[0] == inputs[1]->attr.size[0] &&
        in1_dtype == in2_dtype)
    {
        if (inputs[0]->attr.size[0] == 1)
        {
            *layout = LAYOUT_BNH1;
        }
        else
        {
            *layout = LAYOUT_BNHS;
        }
    }
    else if (axis == 0 && in1_dtype == in2_dtype)
    {
        if (inputs[0]->attr.size[2] == inputs[1]->attr.size[2] &&
            inputs[1]->attr.size[1] == 1)
        {
            *layout = LAYOUT_BSNH;
        }
        else
        {
            *layout = LAYOUT_BNSH;
        }
    }

    key = ROPE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, *layout, interleaved);

    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < (uint32_t)kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = _cnt_of_array( _rope_kernel_param_def );
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
                "vsi_nn_kernel_header",
                kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_ROPE_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_t node = NULL;
    int32_t axis = 0;
    int32_t i = 0;
    int32_t interleaved = 0;
    int32_t param = 0;
    vsi_size_t shape[3][VSI_NN_MAX_DIM_NUM] = { 0 };
    vsi_nn_tensor_t* rs_tensors[4] = { NULL };
    vsi_nn_tensor_t* reshape_tensors[4] = { NULL };
    _internal_rope_layout_e layout = LAYOUT_NONE;

    VSI_UNREFERENCED(params);

    axis = vsi_nn_kernel_param_get_int32(params, "axis");
    interleaved = vsi_nn_kernel_param_get_int32(params, "interleaved");

    // Check if gpu can support the size
    if ( !vsi_nn_kernel_gpu_check_shape(
        inputs[0]->attr.size, inputs[0]->attr.dim_num ) )
    {
        return NULL;
    }

    status = _query_kernel( kernel, inputs, outputs, axis, interleaved, &layout );
    if (outputs[0]->attr.size[0] == 1 || layout == LAYOUT_BSNH)
    {
        memcpy(shape[0], inputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
        memcpy(shape[1], inputs[1]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
        memcpy(shape[2], outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));

        if (outputs[0]->attr.size[0] == 1)
        {
            for (i = 1; i < 3; i++)
            {
                shape[0][i - 1] = shape[0][i];
                shape[1][i - 1] = shape[1][i];
                shape[2][i - 1] = shape[2][i];
            }
            shape[0][2] = 1;
            shape[1][2] = 1;
            shape[2][2] = 1;
        }
        else
        {
            int32_t j = 0;
            for (i = 0; i < 3; i++)
            {
                if (shape[1][i] != 1)
                {
                    shape[1][j] = shape[1][i];
                    j ++;
                }
            }
            for (; j < 3; j++)
            {
                shape[1][j] = 1;
            }
        }

        rs_tensors[0] = vsi_nn_reshape_tensor(graph,
                inputs[0], shape[0], inputs[0]->attr.dim_num);
        rs_tensors[1] = vsi_nn_reshape_tensor(graph,
                inputs[1], shape[1], inputs[1]->attr.dim_num);
        rs_tensors[2] = vsi_nn_reshape_tensor(graph,
                inputs[2], shape[1], inputs[2]->attr.dim_num);
        rs_tensors[3] = vsi_nn_reshape_tensor(graph,
                outputs[0], shape[2], outputs[0]->attr.dim_num);

        if (outputs[0]->attr.size[0] == 1 && axis > 0)
        {
            axis--;
        }
        reshape_tensors[0] = rs_tensors[0];
        reshape_tensors[1] = rs_tensors[1];
        reshape_tensors[2] = rs_tensors[2];
        reshape_tensors[3] = rs_tensors[3];
    }
    else
    {
        reshape_tensors[0] = inputs[0];
        reshape_tensors[1] = inputs[1];
        reshape_tensors[2] = inputs[2];
        reshape_tensors[3] = outputs[0];
    }

    param = (interleaved << 16) | axis;
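    /* Packed to match the decode at the top of _rope_initializer():
     * axis = p & 0xFFFF; interleaved = (p >> 16) & 0xFFFF. */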
    if ( VSI_SUCCESS == status )
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _ROPE_PARAM_NUM,
                    reshape_tensors, input_num, &reshape_tensors[3], output_num );
            /* Pass parameters to node. */
            node_params[SCALAR_AXIS] = vsi_nn_kernel_scalar_create(graph, I32, &param);
            status = vsi_nn_kernel_node_pass_param( node, node_params, _ROPE_PARAM_NUM );
            vsi_nn_kernel_scalar_release(&node_params[SCALAR_AXIS]);
        }
    }

    for (i = 0; i < 4; i++)
    {
        vsi_safe_release_tensor(rs_tensors[i]);
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_EVIS( rope, _setup )

@@ -186,18 +186,26 @@ static const _kernel_map_type scatter_nd_update_special_ref_map[] =
{
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_4)
};

static const _kernel_map_type scatter_nd_update_special_update_map[] =
{
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_4)
};

static const _kernel_map_type scatter_nd_update_special_copy_map[] =
{
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
    TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
};

/*
@@ -563,6 +571,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer)
    {
    case _PACK_SELECT_KEY( I8, I8 ):
    case _PACK_SELECT_KEY( U8, U8 ):
    case _PACK_SELECT_KEY( I16, I16 ):
    case _PACK_SELECT_KEY( U16, U16 ):
        {
            uint16_t M0 = 0;
            int32_t postShift0 = 0;
@@ -605,6 +615,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer)
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
        break;
    case _PACK_SELECT_KEY( F16, F16 ):
        break;
    default:
        break;
    }
@@ -759,6 +771,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer)
    {
    case _PACK_SELECT_KEY( I8, I8 ):
    case _PACK_SELECT_KEY( U8, U8 ):
    case _PACK_SELECT_KEY( I16, I16 ):
    case _PACK_SELECT_KEY( U16, U16 ):
        {
            uint16_t M1 = 0;
            int32_t postShift1 = 0;
@@ -801,6 +815,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer)
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
        break;
    case _PACK_SELECT_KEY( F16, F16 ):
        break;
    default:
        break;
    }
@@ -1597,6 +1613,19 @@ static vsi_status _query_kernel_special
        status |= VSI_FAILURE;
    }

    if (input0_dtype == F16)
    {
        input0_dtype = U16;
    }
    if (input2_dtype == F16)
    {
        input2_dtype = U16;
    }
    if (output_dtype == F16)
    {
        output_dtype = U16;
    }

    key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 6, 1, 0);

    for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_copy_map); i ++ )

@@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@@ -591,7 +592,7 @@ static vsi_nn_kernel_node_t _setup
    VSI_UNREFERENCED(input_num);
    VSI_UNREFERENCED(output_num);
#if (VX_ACTIVATION_EXT_SUPPORT)
    if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
    if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
    {
        return NULL;
    }

@@ -548,16 +548,16 @@ static vsi_status _gpu_register
    vsi_status status;
    vx_kernel_description_t* info;
    vx_kernel obj;
    vsi_nn_context_t context;
    vx_program program = NULL;
    const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt;
    vsi_nn_runtime_option_t* options;
    options = ((vsi_nn_graph_prv_t*)graph)->options;

#define MAX_BUILDPROGRAM_LEN 1024
    char cmd[MAX_BUILDPROGRAM_LEN] = { 0 };
    size_t cost_bytes = 0;

    memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN );
    context = graph->ctx;

    status = VSI_FAILURE;
    info = &(kernel->info);
@@ -579,21 +579,21 @@ static vsi_status _gpu_register
        return status;
    }

    if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE )
    if (options->config.evis.ver == VSI_NN_HW_EVIS_NONE)
    {
        // default the EVIS version to 2
        if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type )
        {
            cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
                "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d",
                context->config.use_40bits_va );
                options->config.use_40bits_va );
        }
    }
    else
    {
        cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
            "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
            context->config.evis.ver, context->config.use_40bits_va );
            options->config.evis.ver, options->config.use_40bits_va );
    }
    // Pack build option
    if( kernel->gpu.sources[active_fmt].build_option.data )
@@ -655,16 +655,16 @@ static vsi_status _gpu_register_ext
    vsi_status status;
    vx_kernel_description_t* info;
    vx_kernel obj;
    vsi_nn_context_t context;
    vx_program program = NULL;
    const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt;
    vsi_nn_runtime_option_t* options;
    options = ((vsi_nn_graph_prv_t*)graph)->options;

#define MAX_BUILDPROGRAM_LEN 1024
    char cmd[MAX_BUILDPROGRAM_LEN] = { 0 };
    size_t cost_bytes = 0;

    memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN );
    context = graph->ctx;

    status = VSI_FAILURE;
    info = &(kernel->info);
@@ -686,21 +686,21 @@ static vsi_status _gpu_register_ext
        return status;
    }

    if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE )
    if (options->config.evis.ver == VSI_NN_HW_EVIS_NONE)
    {
        // default the EVIS version to 2
        if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type )
        {
            cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
                "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d",
                context->config.use_40bits_va );
                options->config.use_40bits_va );
        }
    }
    else
    {
        cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
            "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
            context->config.evis.ver, context->config.use_40bits_va );
            options->config.evis.ver, options->config.use_40bits_va );
    }
    // Pack build option
    if( kernel->gpu.sources[active_fmt].build_option.data )
@@ -1258,7 +1258,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector
        }
        /* Skip EVIS if it is not supported */
        if( type == VSI_NN_KERNEL_TYPE_EVIS
            && graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_NONE )
            && ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver == VSI_NN_HW_EVIS_NONE )
        {
            continue;
        }
@@ -1677,7 +1677,7 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
    int32_t enableShader = ((vsi_nn_graph_prv_t*)graph)->options->enable_shader;

#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
    if ( graph->ctx->config.subGroupSize == 0 )
    if ( ((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize == 0 )
    {
        return FALSE;
    }

@@ -162,15 +162,11 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(pow)
#if (VX_TENSOR_GATHER_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(gather)
#endif
#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(relational_ops)
#endif
#if (VX_TENSOR_TILE_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(tile)
#endif
#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(layer_norm)
#endif
#if (VX_ACTIVATION_EXP_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(exp)
#endif
@@ -184,6 +180,7 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(log_softmax)
#if (VX_BITCAST_VX_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(bitcast)
#endif

REGISTER_VX_FIRST_KERNEL_SELECTOR(group_norm)
REGISTER_VX_FIRST_KERNEL_SELECTOR(instance_norm)

__END_DECLS

@@ -0,0 +1,89 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"

#if VX_GROUP_NORMALIZATION_VX_SUPPORT
#define REGISTER_GROUP_NORM_OPENVX_KERNEL( kernel_name ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t * graph, \
        vsi_nn_tensor_t ** inputs, \
        size_t input_num, \
        vsi_nn_tensor_t ** outputs, \
        size_t output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t * kernel \
        ); \
    REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t * graph, \
        vsi_nn_tensor_t ** inputs, \
        size_t input_num, \
        vsi_nn_tensor_t ** outputs, \
        size_t output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t * kernel \
        )

REGISTER_GROUP_NORM_OPENVX_KERNEL(group_norm)
{
    vx_node node = NULL;
    float eps = vsi_nn_kernel_param_get_float32(params, "eps");
    int32_t group_num = vsi_nn_kernel_param_get_int32(params, "group_num");
    vx_tensor inputs_tensor[3] = { NULL };
    vx_tensor output_tensor = NULL;

    inputs_tensor[0] = inputs[0]->t;
    inputs_tensor[1] = inputs[1]->t;
    inputs_tensor[2] = inputs[2]->t;
    output_tensor = outputs[0]->t;

    VSI_UNREFERENCED(output_num);
    VSI_UNREFERENCED(kernel);

    if (graph->ctx->config.support_ffd ||
        graph->ctx->config.support_stream_processor)
    {
        node = vxGroupNormalizationLayer(
            graph->g,
            eps,
            group_num,
            inputs_tensor,
            (vx_uint32)input_num,
            output_tensor
            );
    }

    return (vsi_nn_kernel_node_t)node;
} /* group_norm() */

#endif
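
For reference, the registration macro above forward-declares the setup
function, registers it with the OpenVX backend, and then reopens the same
signature so that the body following the macro invocation becomes the
function definition. A rough sketch of what
REGISTER_GROUP_NORM_OPENVX_KERNEL(group_norm) expands to (parameter lists
elided; REGISTER_BACKEND_OPENVX supplies the actual registration glue):

static vsi_nn_kernel_node_t _group_normsetup( /* graph, tensors, ... */ );
REGISTER_BACKEND_OPENVX( group_norm, _group_normsetup )
static vsi_nn_kernel_node_t _group_normsetup( /* graph, tensors, ... */ )
{ /* ... the body shown above ... */ }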
@@ -0,0 +1,87 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"

#if VX_INSTANCE_NORMALIZATION_VX_SUPPORT
#define REGISTER_INSTANCE_NORM_OPENVX_KERNEL( kernel_name ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t * graph, \
        vsi_nn_tensor_t ** inputs, \
        size_t input_num, \
        vsi_nn_tensor_t ** outputs, \
        size_t output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t * kernel \
        ); \
    REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t * graph, \
        vsi_nn_tensor_t ** inputs, \
        size_t input_num, \
        vsi_nn_tensor_t ** outputs, \
        size_t output_num,\
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t * kernel \
        )

REGISTER_INSTANCE_NORM_OPENVX_KERNEL(instance_norm)
{
    vsi_nn_kernel_node_t node = NULL;
    float eps = vsi_nn_kernel_param_get_float32(params, "eps");
    vx_tensor inputs_tensor[3] = { NULL };
    vx_tensor output_tensor = NULL;

    inputs_tensor[0] = inputs[0]->t;
    inputs_tensor[1] = inputs[1]->t;
    inputs_tensor[2] = inputs[2]->t;
    output_tensor = outputs[0]->t;

    VSI_UNREFERENCED(output_num);
    VSI_UNREFERENCED(kernel);

    if (graph->ctx->config.support_ffd ||
        graph->ctx->config.support_stream_processor)
    {
        node = vxInstanceNormalizationLayer(
            graph->g,
            eps,
            inputs_tensor,
            (vx_uint32)input_num,
            output_tensor
            );
    }

    return (vsi_nn_kernel_node_t)node;
} /* instance_norm() */

#endif
@@ -30,7 +30,7 @@
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"

#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
#if (VX_LAYER_NORMALIZATION_VX_SUPPORT)
#define REGISTER_LAYER_NORM_OPENVX_KERNEL( kernel_name ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
@@ -71,14 +71,20 @@ REGISTER_LAYER_NORM_OPENVX_KERNEL( layer_norm )
    inputs_tensor[2] = inputs[2]->t;
    output_tensor = outputs[0]->t;

    node = vxLayerNormalizationLayer(
        graph->g,
        eps,
        axis,
        inputs_tensor,
        (uint32_t)input_num,
        output_tensor
#if !defined(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) || !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
    if (graph->ctx->config.support_ffd ||
        graph->ctx->config.support_stream_processor)
#endif
    {
        node = vxLayerNormalizationLayer(
            graph->g,
            eps,
            axis,
            inputs_tensor,
            (uint32_t)input_num,
            output_tensor
            );
    }

    return (vsi_nn_kernel_node_t)node;
} /* layer_norm() */
|
|||
|
|
@ -89,9 +89,10 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 )
|
|||
if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
|
||||
{
|
||||
vsi_nn_tensor_attr_t attr;
|
||||
|
||||
memcpy( &attr, &outputs[0]->attr, sizeof( attr ) );
|
||||
memcpy( &attr.size, &inputs[0]->attr.size, sizeof( attr.size ) );
|
||||
attr.vtl = FALSE;
|
||||
attr.vtl = TRUE;
|
||||
attr.is_const = FALSE;
|
||||
|
||||
convert_tensor = vsi_nn_CreateTensor(graph, &attr);
|
||||
|
|
|
|||
|
|
@@ -30,7 +30,7 @@
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"

#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
#if (VX_RELATIONAL_OPS_VX_SUPPORT)

#define REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( kernel_name ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
@@ -68,12 +68,25 @@ REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( relational_ops )
    VSI_UNREFERENCED(kernel);
    VSI_UNREFERENCED(output_num);

    node = vxRelationalLayer(graph->g,
        operation,
        inputs_tensor,
        (uint32_t)input_num,
        outputs[0]->t
        );
#if !defined(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) || !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
    if (vsi_nn_is_broadcast_operaton(inputs, input_num, outputs[0]))
    {
        return NULL;
    }
#endif

#if !defined(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) || !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
    if (graph->ctx->config.support_stream_processor)
#endif
    {
        node = vxRelationalLayer(
            graph->g,
            operation,
            inputs_tensor,
            (uint32_t)input_num,
            outputs[0]->t
            );
    }

    return (vsi_nn_kernel_node_t)node;
} /* relational_ops() */
|
|||
|
|
@ -23,6 +23,7 @@
|
|||
*****************************************************************************/
|
||||
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_types_prv.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_node.h"
|
||||
#include "vsi_nn_log.h"
|
||||
|
|
@@ -66,7 +67,7 @@ REGISTER_SWISH_OPENVX_KERNEL( swish )
    VSI_UNREFERENCED(output_num);
    VSI_UNREFERENCED(input_num);

    if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
    if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
    {
        swish_type = (vsi_nn_swish_type)vsi_nn_kernel_param_get_int32(params, "type");

@@ -67,8 +67,8 @@ __kernel void cumsum_F32toF32_axis2(
    }
}

#define CUMSUM_toU8_AXIS2_SH(name, src_type, read_image_type) \
__kernel void cumsum_##name##toU8_axis2( \
#define CUMSUM_toINT_AXIS2_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##_axis2( \
    __read_only image2d_array_t input, \
    __write_only image2d_array_t output, \
    int axis, \
@@ -87,19 +87,19 @@ __kernel void cumsum_##name##toU8_axis2( \
    int4 coord_out = coord; \
 \
    src_type sum = (src_type)(0); \
    uint4 dst = (uint4)(0); \
    dst_type dst = (dst_type)(0); \
    int tmp_zp = convert_int_rte(output_zp); \
    dst.x = convert_uint_sat(tmp_zp); \
    dst.x = convert_dtype(tmp_zp); \
 \
    float cnt = 0.0f; \
 \
    if(exclusive && rev) \
    { \
        coord_out.z = channel - 1; \
        write_imageui(output, coord_out, dst); \
        image_write(output, coord_out, dst); \
        for(coord.z = channel - 1; coord.z > 0; coord.z--) \
        { \
            src_type data = read_image_type(input, coord); \
            src_type data = image_read(input, coord); \
            coord_out.z--; \
            cnt += 1.0f; \
            sum += data; \
@@ -107,17 +107,17 @@ __kernel void cumsum_##name##toU8_axis2( \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = (uint)convert_int_rte(tmpSum); \
            write_imageui(output, coord_out, dst); \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord_out, dst); \
        } \
    } \
    else if(exclusive) \
    { \
        coord_out.z = 0; \
        write_imageui(output, coord_out, dst); \
        image_write(output, coord_out, dst); \
        for(coord.z = 0; coord.z < channel - 1; coord.z++) \
        { \
            src_type data = read_image_type(input, coord); \
            src_type data = image_read(input, coord); \
            coord_out.z++; \
            cnt += 1.0f; \
            sum += data; \
@@ -125,45 +125,44 @@ __kernel void cumsum_##name##toU8_axis2( \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = (uint)convert_int_rte(tmpSum); \
            write_imageui(output, coord_out, dst); \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord_out, dst); \
        } \
    } \
    else if(rev) \
    { \
        for(coord.z = channel - 1; coord.z >= 0; coord.z--) \
        { \
            src_type data = read_image_type(input, coord); \
            src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            sum += data; \
 \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = (uint)convert_int_rte(tmpSum); \
            write_imageui(output, coord, dst); \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord, dst); \
        } \
    } \
    else \
    { \
        for(coord.z = 0; coord.z < channel; coord.z++) \
        { \
            src_type data = read_image_type(input, coord); \
            src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            sum += data; \
 \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = (uint)convert_int_rte(tmpSum); \
            write_imageui(output, coord, dst); \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord, dst); \
        } \
    } \
}
CUMSUM_toU8_AXIS2_SH(U8,uint4,read_imageui)
CUMSUM_toU8_AXIS2_SH(F32,float4,read_imagef)

CUMSUM_toINT_AXIS2_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS2_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS2_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
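
/* Requantization note: every store in the macro above computes
 *     tmpSum = sum.x * in_out_scale + cnt * in_out_zp_scale + output_zp;
 * i.e. the running sum is rescaled from input to output quantization, and the
 * input zero-point contribution accumulated over cnt summands is folded in,
 * before convert_dtype() saturates the result to the destination type. */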

__kernel void cumsum_F32toF32_axis1(
    __read_only image2d_array_t input,
@@ -233,10 +232,10 @@ __kernel void cumsum_F32toF32_axis1(
    }
}

#define CUMSUM_toU8_AXIS1_SH(name, src_type, read_image_type) \
__kernel void cumsum_##name##toU8_axis1( \
    __read_only image2d_array_t input, \
    __write_only image2d_array_t output, \
#define CUMSUM_toINT_AXIS1_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##_axis1( \
    __read_only image2d_array_t input, \
    __write_only image2d_array_t output, \
    int axis, \
    int exclusive, \
    int rev, \
@@ -253,20 +252,20 @@ __kernel void cumsum_##name##toU8_axis1( \
    int4 coord_out = coord; \
 \
    src_type sum = (src_type)(0); \
    uint4 dst = (uint4)(0); \
    dst_type dst = (dst_type)(0); \
    int tmp_zp = convert_int_rte(output_zp); \
    dst.x = convert_uint_sat(tmp_zp); \
    dst.x = convert_dtype(tmp_zp); \
 \
    float cnt = 0; \
 \
    if(exclusive && rev) \
    { \
        coord_out.y = height - 1; \
        write_imageui(output, coord_out, dst); \
        image_write(output, coord_out, dst); \
 \
        for(coord.y = height - 1; coord.y > 0; coord.y--) \
        { \
            src_type data = read_image_type(input, coord); \
            src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            coord_out.y--; \
            sum += data; \
@@ -274,17 +273,17 @@ __kernel void cumsum_##name##toU8_axis1( \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = (uint)convert_int_rte(tmpSum); \
            write_imageui(output, coord_out, dst); \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord_out, dst); \
        } \
    } \
    else if(exclusive) \
    { \
        coord_out.y = 0; \
        write_imageui(output, coord_out, dst); \
        image_write(output, coord_out, dst); \
        for(coord.y = 0; coord.y < height - 1; coord.y++) \
        { \
            src_type data = read_image_type(input, coord); \
            src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            coord_out.y++; \
            sum += data; \
@ -292,44 +291,44 @@ __kernel void cumsum_##name##toU8_axis1( \
|
|||
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
|
||||
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
|
||||
\
|
||||
dst.x = (uint)convert_int_rte(tmpSum); \
|
||||
write_imageui(output, coord_out, dst); \
|
||||
dst.x = convert_dtype(tmpSum); \
|
||||
image_write(output, coord_out, dst); \
|
||||
} \
|
||||
} \
|
||||
else if(rev) \
|
||||
{ \
|
||||
for(coord.y = height - 1; coord.y >= 0; coord.y--) \
|
||||
{ \
|
||||
src_type data = read_image_type(input, coord); \
|
||||
src_type data = image_read(input, coord); \
|
||||
cnt += 1.0f; \
|
||||
sum += data; \
|
||||
\
|
||||
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
|
||||
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
|
||||
\
|
||||
dst.x = (uint)convert_int_rte(tmpSum); \
|
||||
write_imageui(output, coord, dst); \
|
||||
dst.x = convert_dtype(tmpSum); \
|
||||
image_write(output, coord, dst); \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for(coord.y = 0; coord.y < height; coord.y++) \
|
||||
{ \
|
||||
src_type data = read_image_type(input, coord); \
|
||||
src_type data = image_read(input, coord); \
|
||||
cnt += 1.0f; \
|
||||
sum += data; \
|
||||
\
|
||||
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
|
||||
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
|
||||
\
|
||||
dst.x = (uint)convert_int_rte(tmpSum); \
|
||||
write_imageui(output, coord, dst); \
|
||||
dst.x = convert_dtype(tmpSum); \
|
||||
image_write(output, coord, dst); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
CUMSUM_toU8_AXIS1_SH(U8,uint4,read_imageui)
|
||||
CUMSUM_toU8_AXIS1_SH(F32,float4,read_imagef)
|
||||
|
||||
CUMSUM_toINT_AXIS1_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
|
||||
CUMSUM_toINT_AXIS1_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
|
||||
CUMSUM_toINT_AXIS1_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
|
||||
|
||||
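Each CUMSUM_toINT_AXIS1_SH line stamps out a complete kernel. For orientation, the inclusive (exclusive = 0, rev = 0) path of the I32toI32 instantiation boils down to the following standalone sketch; the trimmed parameter list is illustrative, not the exact generated signature:

    /* Minimal sketch of the inclusive path CUMSUM_toINT_AXIS1_SH(I32toI32, ...)
     * generates, with the macro parameters substituted. */
    __kernel void cumsum_i32_axis1_sketch(
        __read_only  image2d_array_t input,
        __write_only image2d_array_t output,
        int height, float in_out_scale, float in_out_zp_scale, float output_zp)
    {
        int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0);
        int4 sum = (int4)(0);
        int4 dst = (int4)(0);
        float cnt = 0.0f;
        for (coord.y = 0; coord.y < height; coord.y++)
        {
            int4 data = read_imagei(input, coord);   /* image_read   -> read_imagei         */
            cnt += 1.0f;
            sum += data;
            float tmpSum = sum.x * in_out_scale + cnt * in_out_zp_scale + output_zp;
            dst.x = convert_int_sat_rte(tmpSum);     /* convert_dtype -> convert_int_sat_rte */
            write_imagei(output, coord, dst);        /* image_write  -> write_imagei         */
        }
    }
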
__kernel void cumsum_F32toF32_axis0(
    __read_only image2d_array_t input,

@@ -399,8 +398,8 @@ __kernel void cumsum_F32toF32_axis0(
    }
}

#define CUMSUM_toU8_AXIS0_SH(name, src_type, read_image_type) \
__kernel void cumsum_##name##toU8_axis0( \
#define CUMSUM_toINT_AXIS0_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##_axis0( \
    __read_only image2d_array_t input, \
    __write_only image2d_array_t output, \
    int axis, \

@@ -419,19 +418,19 @@ __kernel void cumsum_##name##toU8_axis0( \
    int4 coord_out = coord; \
 \
    src_type sum = (src_type)(0); \
    uint4 dst = (uint4)(0); \
    dst_type dst = (dst_type)(0); \
    int tmp_zp = convert_int_rte(output_zp); \
    dst.x = convert_uint_sat(tmp_zp); \
    dst.x = convert_dtype(tmp_zp); \
 \
    float cnt = 0; \
 \
    if(exclusive && rev) \
    { \
        coord_out.x = width - 1; \
        write_imageui(output, coord_out, dst); \
        image_write(output, coord_out, dst); \
        for(coord.x = width - 1; coord.x > 0; coord.x--) \
        { \
            src_type data = read_image_type(input, coord); \
            src_type data = image_read(input, coord); \
            coord_out.x--; \
            cnt += 1.0f; \
            sum += data; \

@@ -439,8 +438,8 @@ __kernel void cumsum_##name##toU8_axis0( \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = (uint)convert_int_rte(tmpSum); \
            write_imageui(output, coord_out, dst); \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord_out, dst); \
        } \
    } \
    else if(exclusive) \

@@ -449,7 +448,7 @@ __kernel void cumsum_##name##toU8_axis0( \
        write_imageui(output, coord_out, dst); \
        for(coord.x = 0; coord.x < width - 1; coord.x++) \
        { \
            src_type data = read_image_type(input, coord); \
            src_type data = image_read(input, coord); \
            coord_out.x++; \
            cnt += 1.0f; \
            sum += data; \

@@ -457,40 +456,42 @@ __kernel void cumsum_##name##toU8_axis0( \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = (uint)convert_int_rte(tmpSum); \
            write_imageui(output, coord_out, dst); \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord_out, dst); \
        } \
    } \
    else if(rev) \
    { \
        for(coord.x = width - 1; coord.x >= 0; coord.x--) \
        { \
            src_type data = read_image_type(input, coord); \
            src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            sum += data; \
 \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = (uint)convert_int_rte(tmpSum); \
            write_imageui(output, coord, dst); \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord, dst); \
        } \
    } \
    else \
    { \
        for(coord.x = 0; coord.x < width; coord.x++) \
        { \
            src_type data = read_image_type(input, coord); \
            src_type data = image_read(input, coord); \
            cnt += 1.0f; \
            sum += data; \
 \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = (uint)convert_int_rte(tmpSum); \
            write_imageui(output, coord, dst); \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord, dst); \
        } \
    } \
}
CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui)
CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef)
CUMSUM_toINT_AXIS0_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS0_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS0_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)

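Across all three axes the four branches encode the same flag semantics, which from the names appear to follow TensorFlow's cumsum: exclusive shifts the output one slot and seeds the first slot with the encoded zero, rev walks the axis backwards. A host-side scalar reference (hypothetical helper, float-only to keep the quantization out of the way):

    /* Host-side reference for one row; exclusive/rev mirror the kernel's
     * four branches. Illustrative only. */
    void cumsum_row_ref(const float *in, float *out, int n, int exclusive, int rev)
    {
        float sum = 0.0f;
        int step  = rev ? -1 : 1;
        int read  = rev ? n - 1 : 0;          /* first element consumed  */
        int write = read;                     /* first element produced  */
        for (int i = 0; i < n; i++, read += step, write += step)
        {
            if (exclusive) out[write] = sum;  /* value before adding in[] */
            sum += in[read];
            if (!exclusive) out[write] = sum; /* inclusive: value after   */
        }
    }
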

@@ -65,188 +65,100 @@ __kernel void cumsum_F32toF32_axis1_2D(
    }
}

__kernel void cumsum_U8toU8_axis1_2D(
    __read_only image2d_t input,
    __write_only image2d_t output,
    int axis,
    int exclusive,
    int rev,
    int width,
    int height,
    int chn,
    int input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));

    uint4 sum = (uint4)(0);
    uint4 dst = (uint4)(0);

    int tmp_zp = convert_int_rte(output_zp);
    dst.x = convert_uint_sat(tmp_zp);

    float cnt = 0;

    if(exclusive && rev)
    {
        coord.w = height - 1;
        write_imageui(output, coord.zw, dst);
        for(coord.y = height - 1; coord.y > 0; coord.y--)
        {
            uint4 data = read_imageui(input, coord.xy);
            cnt += 1.0f;
            coord.w--;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.zw, dst);
        }
    }
    else if(exclusive)
    {
        write_imageui(output, coord.zw, dst);
        for(coord.y = 0; coord.y < height - 1; coord.y++)
        {
            uint4 data = read_imageui(input, coord.xy);
            cnt += 1.0f;
            coord.w++;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.zw, dst);
        }
    }
    else if(rev)
    {
        for(coord.y = height - 1; coord.y >= 0; coord.y--)
        {
            uint4 data = read_imageui(input, coord.xy);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.xy, dst);
        }
    }
    else
    {
        for(coord.y = 0; coord.y < height; coord.y++)
        {
            uint4 data = read_imageui(input, coord.xy);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.xy, dst);
        }
    }
}

__kernel void cumsum_F32toU8_axis1_2D(
    __read_only image2d_t input,
    __write_only image2d_t output,
    int axis,
    int exclusive,
    int rev,
    int width,
    int height,
    int chn,
    int input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));

    float4 sum = (float4)(0);
    uint4 dst = (uint4)(0);
    int tmp_zp = convert_int_rte(output_zp);
    dst.x = convert_uint_sat(tmp_zp);

    float cnt = 0;

    if(exclusive && rev)
    {
        coord.w = height - 1;
        write_imageui(output, coord.zw, dst);
        for(coord.y = height - 1; coord.y > 0; coord.y--)
        {
            float4 data = read_imagef(input, coord.xy);
            cnt += 1.0f;
            coord.w--;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.zw, dst);
        }
    }
    else if(exclusive)
    {
        write_imageui(output, coord.zw, dst);
        for(coord.y = 0; coord.y < height - 1; coord.y++)
        {
            float4 data = read_imagef(input, coord.xy);
            cnt += 1.0f;
            coord.w++;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.zw, dst);
        }
    }
    else if(rev)
    {
        for(coord.y = height - 1; coord.y >= 0; coord.y--)
        {
            float4 data = read_imagef(input, coord.xy);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.xy, dst);
        }
    }
    else
    {
        for(coord.y = 0; coord.y < height; coord.y++)
        {
            float4 data = read_imagef(input, coord.xy);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.xy, dst);
        }
    }
#define CUMSUM_INT_AXIS1_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##_axis1_2D( \
    __read_only image2d_t input, \
    __write_only image2d_t output, \
    int axis, \
    int exclusive, \
    int rev, \
    int width, \
    int height, \
    int chn, \
    int input_zp, \
    float in_out_scale, \
    float in_out_zp_scale, \
    float output_zp \
    ) \
{ \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
 \
    src_type sum = (src_type)(0); \
    dst_type dst = (dst_type)(0); \
    int tmp_zp = convert_int_rte(output_zp); \
    dst.x = convert_dtype(tmp_zp); \
 \
    float cnt = 0; \
 \
    if(exclusive && rev) \
    { \
        coord.w = height - 1; \
        image_write(output, coord.zw, dst); \
        for(coord.y = height - 1; coord.y > 0; coord.y--) \
        { \
            src_type data = image_read(input, coord.xy); \
            cnt += 1.0f; \
            coord.w--; \
            sum += data; \
 \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord.zw, dst); \
        } \
    } \
    else if(exclusive) \
    { \
        image_write(output, coord.zw, dst); \
        for(coord.y = 0; coord.y < height - 1; coord.y++) \
        { \
            src_type data = image_read(input, coord.xy); \
            cnt += 1.0f; \
            coord.w++; \
            sum += data; \
 \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord.zw, dst); \
        } \
    } \
    else if(rev) \
    { \
        for(coord.y = height - 1; coord.y >= 0; coord.y--) \
        { \
            src_type data = image_read(input, coord.xy); \
            cnt += 1.0f; \
            sum += data; \
 \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord.xy, dst); \
        } \
    } \
    else \
    { \
        for(coord.y = 0; coord.y < height; coord.y++) \
        { \
            src_type data = image_read(input, coord.xy); \
            cnt += 1.0f; \
            sum += data; \
 \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord.xy, dst); \
        } \
    } \
}
CUMSUM_INT_AXIS1_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_INT_AXIS1_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_INT_AXIS1_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)

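In the 2D kernels a single int4 carries both positions: coord.xy is where the next input is read and coord.zw where the next output lands, so the exclusive variants can keep the write cursor one row ahead of the read cursor without extra variables. Stripped to the bare idiom (demo kernel, not part of the diff):

    /* Sketch of the coord.xy / coord.zw packing used above. */
    __kernel void coord_pack_demo(__read_only image2d_t input,
                                  __write_only image2d_t output,
                                  int height)
    {
        /* x, y duplicated into z, w: xy = read position, zw = write position */
        int4 coord = (int4)(get_global_id(0), get_global_id(1),
                            get_global_id(0), get_global_id(1));
        for (coord.y = 0; coord.y < height - 1; coord.y++)
        {
            float4 data = read_imagef(input, coord.xy);
            coord.w++;                      /* write runs one row ahead (exclusive layout) */
            write_imagef(output, coord.zw, data);
        }
    }
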
__kernel void cumsum_F32toF32_axis0_2D(
    __read_only image2d_t input,

@@ -316,188 +228,100 @@ __kernel void cumsum_F32toF32_axis0_2D(
    }
}

__kernel void cumsum_U8toU8_axis0_2D(
    __read_only image2d_t input,
    __write_only image2d_t output,
    int axis,
    int exclusive,
    int rev,
    int width,
    int height,
    int chn,
    int input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));

    uint4 sum = (uint4)(0);
    uint4 dst = (uint4)(0);

    int tmp_zp = convert_int_rte(output_zp);
    dst.x = convert_uint_sat(tmp_zp);

    float cnt = 0.0f;

    if(exclusive && rev)
    {
        coord.x = width - 1;
        coord.z = coord.x;
        write_imageui(output, coord.zw, dst);
        for(; coord.x > 0; coord.x--)
        {
            uint4 data = read_imageui(input, coord.xy);
            coord.z--;
            cnt += 1.0;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.zw, dst);
        }
    }
    else if(exclusive)
    {
        coord.z = 0;
        write_imageui(output, coord.zw, dst);
        for(coord.x = 0; coord.x < width - 1; coord.x++)
        {
            uint4 data = read_imageui(input, coord.xy);
            cnt += 1.0f;
            coord.z++;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.zw, dst);
        }
    }
    else if(rev)
    {
        for(coord.x = width - 1; coord.x >= 0; coord.x--)
        {
            uint4 data = read_imageui(input, coord.xy);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.xy, dst);
        }
    }
    else
    {
        for(coord.x = 0; coord.x < width; coord.x++)
        {
            uint4 data = read_imageui(input, coord.xy);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.xy, dst);
        }
    }
}

__kernel void cumsum_F32toU8_axis0_2D(
    __read_only image2d_t input,
    __write_only image2d_t output,
    int axis,
    int exclusive,
    int rev,
    int width,
    int height,
    int chn,
    int input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));

    float4 sum = (float4)(0);
    uint4 dst = (uint4)(0);
    int tmp_zp = convert_int_rte(output_zp);
    dst.x = convert_uint_sat(tmp_zp);

    float cnt = 0.0f;
    if(exclusive && rev)
    {
        coord.x = width - 1;
        coord.z = coord.x;
        write_imageui(output, coord.zw, dst);
        for(; coord.x > 0; coord.x--)
        {
            float4 data = read_imagef(input, coord.xy);
            coord.z--;
            cnt += 1.0;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.zw, dst);
        }
    }
    else if(exclusive)
    {
        coord.z = 0;
        write_imageui(output, coord.zw, dst);
        for(coord.x = 0; coord.x < width - 1; coord.x++)
        {
            float4 data = read_imagef(input, coord.xy);
            cnt += 1.0f;
            coord.z++;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.zw, dst);
        }
    }
    else if(rev)
    {
        for(coord.x = width - 1; coord.x >= 0; coord.x--)
        {
            float4 data = read_imagef(input, coord.xy);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.xy, dst);
        }
    }
    else
    {
        for(coord.x = 0; coord.x < width; coord.x++)
        {
            float4 data = read_imagef(input, coord.xy);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.xy, dst);
        }
    }
#define CUMSUM_INT_AXIS0_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##_axis0_2D( \
    __read_only image2d_t input, \
    __write_only image2d_t output, \
    int axis, \
    int exclusive, \
    int rev, \
    int width, \
    int height, \
    int chn, \
    int input_zp, \
    float in_out_scale, \
    float in_out_zp_scale, \
    float output_zp \
    ) \
{ \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
 \
    src_type sum = (src_type)(0); \
    dst_type dst = (dst_type)(0); \
 \
    int tmp_zp = convert_int_rte(output_zp); \
    dst.x = convert_dtype(tmp_zp); \
 \
    float cnt = 0.0f; \
 \
    if(exclusive && rev) \
    { \
        coord.x = width - 1; \
        coord.z = coord.x; \
        image_write(output, coord.zw, dst); \
        for(; coord.x > 0; coord.x--) \
        { \
            src_type data = image_read(input, coord.xy); \
            coord.z--; \
            cnt += 1.0; \
            sum += data; \
 \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord.zw, dst); \
        } \
    } \
    else if(exclusive) \
    { \
        coord.z = 0; \
        image_write(output, coord.zw, dst); \
        for(coord.x = 0; coord.x < width - 1; coord.x++) \
        { \
            src_type data = image_read(input, coord.xy); \
            cnt += 1.0f; \
            coord.z++; \
            sum += data; \
 \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord.zw, dst); \
        } \
    } \
    else if(rev) \
    { \
        for(coord.x = width - 1; coord.x >= 0; coord.x--) \
        { \
            src_type data = image_read(input, coord.xy); \
            cnt += 1.0f; \
            sum += data; \
 \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord.xy, dst); \
        } \
    } \
    else \
    { \
        for(coord.x = 0; coord.x < width; coord.x++) \
        { \
            src_type data = image_read(input, coord.xy); \
            cnt += 1.0f; \
            sum += data; \
 \
            float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
            float tmpSum = sum.x * in_out_scale + tmpAlpha; \
 \
            dst.x = convert_dtype(tmpSum); \
            image_write(output, coord.xy, dst); \
        } \
    } \
}
CUMSUM_INT_AXIS0_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_INT_AXIS0_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_INT_AXIS0_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)

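All of these cumsum kernels consume the same four scalars. The diff does not show the host code that fills them in, but with the usual affine model q = r/scale + zp the in-kernel formula is consistent with the following setup (names and struct are hypothetical):

    /* Hypothetical host-side derivation of the cumsum requantization constants. */
    typedef struct { float scale; int zp; } qparam;

    void cumsum_scales(qparam in, qparam out, float *in_out_scale,
                       float *in_out_zp_scale, float *output_zp)
    {
        *in_out_scale    = in.scale / out.scale;           /* rescales the raw sum   */
        *in_out_zp_scale = -in.zp * in.scale / out.scale;  /* per-element zp offset  */
        *output_zp       = (float)out.zp;                  /* re-centers the output  */
    }
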

@@ -132,3 +132,30 @@ __kernel void one_hot_U8toU8
        coord.z ++;
    } while (coord.z < depth);
}

__kernel void one_hot_I32toBF16
    (
    __read_only image2d_t input,
    __write_only image2d_array_t output,
    int depth,
    uint on_value,
    uint off_value,
    float inputScale,
    float inputTail
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);

    int4 src = read_imagei(input, coord.xy);

    int val = convert_int(convert_float(src.x) * inputScale - inputTail);
    do
    {
        uint4 dst;
        dst.x = val == coord.z ? on_value : off_value;

        write_imageui(output, coord.xzyw, dst.xxxx);

        coord.z ++;
    } while (coord.z < depth);
}

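one_hot_I32toBF16 moves BF16 data through write_imageui, so on_value and off_value must arrive as raw bit patterns rather than floats. Since a bfloat16 payload is just the high half of the float32 encoding, the conversion is a one-liner (a sketch; the host side presumably does the equivalent):

    /* Truncate a float to its bfloat16 bit pattern (round-toward-zero). */
    ushort float_to_bf16_bits(float f)
    {
        return (ushort)(as_uint(f) >> 16);  /* bf16 keeps the high 16 bits */
    }
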
@@ -0,0 +1,373 @@
__kernel void rope_F32_F32toF32_axis0
    (
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis,
    float input_zp,
    float cos_zp,
    float sin_zp,
    float scale0,
    float scale1,
    float output_zp,
    int half_head_size,
    int step
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
    float4 cos, sin;

    READ_IMAGEF_2DARRAY(cos, cos_cache, coord);
    READ_IMAGEF_2DARRAY(sin, sin_cache, coord);
    coord.x = coord.x * step;
    float4 src0 = read_imagef(input, coord);
    int4 coord_out = coord;

    coord.x += half_head_size;
    float4 src1 = read_imagef(input, coord);

    float4 dst0 = src0 * cos - src1 * sin;
    float4 dst1 = src0 * sin + src1 * cos;

    write_imagef(output, coord_out, dst0);
    coord_out.x += half_head_size;
    write_imagef(output, coord_out, dst1);
}

__kernel void rope_F32_F32toF32_axis1
    (
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis,
    float input_zp,
    float cos_zp,
    float sin_zp,
    float scale0,
    float scale1,
    float output_zp,
    int half_head_size,
    int step
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
    float4 cos, sin;

    READ_IMAGEF_2DARRAY(cos, cos_cache, coord);
    READ_IMAGEF_2DARRAY(sin, sin_cache, coord);
    coord.y = coord.y * step;
    float4 src0 = read_imagef(input, coord);
    int4 coord_out = coord;
    coord.y += half_head_size;
    float4 src1 = read_imagef(input, coord);

    float4 dst0 = src0 * cos - src1 * sin;
    float4 dst1 = src0 * sin + src1 * cos;

    write_imagef(output, coord_out, dst0);
    coord_out.y += half_head_size;
    write_imagef(output, coord_out, dst1);
}

__kernel void rope_F32_F32toF32_axis2
    (
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis,
    float input_zp,
    float cos_zp,
    float sin_zp,
    float scale0,
    float scale1,
    float output_zp,
    int half_head_size,
    int step
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));

    float4 cos = read_imagef(cos_cache, coord);
    float4 sin = read_imagef(sin_cache, coord);
    coord.z = coord.z * step;
    float4 src0 = read_imagef(input, coord);
    int4 coord_out = coord;
    coord.z += half_head_size;
    float4 src1 = read_imagef(input, coord);

    float4 dst0 = src0 * cos - src1 * sin;
    float4 dst1 = src0 * sin + src1 * cos;

    write_imagef(output, coord_out, dst0);
    coord_out.z += half_head_size;
    write_imagef(output, coord_out, dst1);
}

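These kernels are rotary position embedding in rotate-half form: the head dimension splits at half_head_size, and each pair taken from the two halves is rotated by the cached angle, x0' = x0·cos − x1·sin and x1' = x0·sin + x1·cos. As a one-pair helper:

    /* Rotate-half RoPE for one (x0, x1) pair, as in the F32 kernels above. */
    float2 rope_pair(float x0, float x1, float c, float s)
    {
        return (float2)(x0 * c - x1 * s,   /* written at offset i                  */
                        x0 * s + x1 * c);  /* written at offset i + half_head_size */
    }
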
__kernel void rope_I32_I32toI32_axis0
    (
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis,
    float input_zp,
    float cos_zp,
    float sin_zp,
    float scale0,
    float scale1,
    float output_zp,
    int half_head_size,
    int step
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
    int4 _cos, _sin;
    float4 cos, sin;

    READ_IMAGEI_2DARRAY(_cos, cos_cache, coord);
    READ_IMAGEI_2DARRAY(_sin, sin_cache, coord);
    coord.x = coord.x * step;
    float4 src0 = convert_float4(read_imagei(input, coord));
    int4 coord_out = coord;

    coord.x += half_head_size;
    float4 src1 = convert_float4(read_imagei(input, coord));

    src0 = src0 - input_zp;
    src1 = src1 - input_zp;
    cos = convert_float4(_cos) - cos_zp;
    sin = convert_float4(_sin) - sin_zp;

    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
    int4 dst0 = convert_int4_rte(_dst0);
    int4 dst1 = convert_int4_rte(_dst1);

    write_imagei(output, coord_out, dst0);
    coord_out.x += half_head_size;
    write_imagei(output, coord_out, dst1);
}

__kernel void rope_I32_I32toI32_axis1
    (
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis,
    float input_zp,
    float cos_zp,
    float sin_zp,
    float scale0,
    float scale1,
    float output_zp,
    int half_head_size,
    int step
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
    int4 _cos, _sin;
    float4 cos, sin;

    READ_IMAGEI_2DARRAY(_cos, cos_cache, coord);
    READ_IMAGEI_2DARRAY(_sin, sin_cache, coord);
    coord.y = coord.y * step;
    float4 src0 = convert_float4(read_imagei(input, coord));
    int4 coord_out = coord;

    coord.y += half_head_size;
    float4 src1 = convert_float4(read_imagei(input, coord));

    src0 = src0 - input_zp;
    src1 = src1 - input_zp;
    cos = convert_float4(_cos) - cos_zp;
    sin = convert_float4(_sin) - sin_zp;

    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
    int4 dst0 = convert_int4_rte(_dst0);
    int4 dst1 = convert_int4_rte(_dst1);

    write_imagei(output, coord_out, dst0);
    coord_out.y += half_head_size;
    write_imagei(output, coord_out, dst1);
}

__kernel void rope_I32_I32toI32_axis2
    (
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis,
    float input_zp,
    float cos_zp,
    float sin_zp,
    float scale0,
    float scale1,
    float output_zp,
    int half_head_size,
    int step
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));

    float4 cos = convert_float4(read_imagei(cos_cache, coord));
    float4 sin = convert_float4(read_imagei(sin_cache, coord));
    coord.z = coord.z * step;
    float4 src0 = convert_float4(read_imagei(input, coord));
    int4 coord_out = coord;

    coord.z += half_head_size;
    float4 src1 = convert_float4(read_imagei(input, coord));

    src0 = src0 - input_zp;
    src1 = src1 - input_zp;
    cos = cos - cos_zp;
    sin = sin - sin_zp;

    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
    int4 dst0 = convert_int4_rte(_dst0);
    int4 dst1 = convert_int4_rte(_dst1);

    write_imagei(output, coord_out, dst0);
    coord_out.z += half_head_size;
    write_imagei(output, coord_out, dst1);
}

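The integer variants dequantize inline: zero points come off the inputs and the cos/sin caches, the two partial products are rescaled by scale0 and scale1, and output_zp re-centers the result. Presumably scale0 and scale1 fold input_scale·cos_scale/output_scale and input_scale·sin_scale/output_scale respectively, though the host side is not part of this diff. One output lane:

    /* One output lane of the quantized RoPE, spelled out (names illustrative). */
    int rope_requant_lane(int q_x0, int q_x1, int q_cos, int q_sin,
                          float input_zp, float cos_zp, float sin_zp,
                          float scale0, float scale1, float output_zp)
    {
        float x0 = q_x0 - input_zp, x1 = q_x1 - input_zp;
        float c  = q_cos - cos_zp,  s  = q_sin - sin_zp;
        /* scale0/scale1 fold the input, cache and output scales together */
        return convert_int_rte(x0 * c * scale0 - x1 * s * scale1 + output_zp);
    }
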
__kernel void rope_U32_U32toU32_axis0
    (
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis,
    float input_zp,
    float cos_zp,
    float sin_zp,
    float scale0,
    float scale1,
    float output_zp,
    int half_head_size,
    int step
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
    uint4 _cos, _sin;
    float4 cos, sin;

    READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord);
    READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord);
    coord.x = coord.x * step;
    float4 src0 = convert_float4(read_imageui(input, coord));
    int4 coord_out = coord;

    coord.x += half_head_size;
    float4 src1 = convert_float4(read_imageui(input, coord));

    src0 = src0 - input_zp;
    src1 = src1 - input_zp;
    cos = convert_float4(_cos) - cos_zp;
    sin = convert_float4(_sin) - sin_zp;

    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
    uint4 dst0 = convert_uint4_rte(_dst0);
    uint4 dst1 = convert_uint4_rte(_dst1);

    write_imageui(output, coord_out, dst0);
    coord_out.x += half_head_size;
    write_imageui(output, coord_out, dst1);
}

__kernel void rope_U32_U32toU32_axis1
    (
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis,
    float input_zp,
    float cos_zp,
    float sin_zp,
    float scale0,
    float scale1,
    float output_zp,
    int half_head_size,
    int step
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
    uint4 _cos, _sin;
    float4 cos, sin;

    READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord);
    READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord);
    coord.y = coord.y * step;
    float4 src0 = convert_float4(read_imageui(input, coord));
    int4 coord_out = coord;

    coord.y += half_head_size;
    float4 src1 = convert_float4(read_imageui(input, coord));

    src0 = src0 - input_zp;
    src1 = src1 - input_zp;
    cos = convert_float4(_cos) - cos_zp;
    sin = convert_float4(_sin) - sin_zp;

    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
    uint4 dst0 = convert_uint4_rte(_dst0);
    uint4 dst1 = convert_uint4_rte(_dst1);

    write_imageui(output, coord_out, dst0);
    coord_out.y += half_head_size;
    write_imageui(output, coord_out, dst1);
}

__kernel void rope_U32_U32toU32_axis2
    (
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis,
    float input_zp,
    float cos_zp,
    float sin_zp,
    float scale0,
    float scale1,
    float output_zp,
    int half_head_size,
    int step
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));

    float4 cos = convert_float4(read_imageui(cos_cache, coord));
    float4 sin = convert_float4(read_imageui(sin_cache, coord));
    coord.z = coord.z * step;
    float4 src0 = convert_float4(read_imageui(input, coord));
    int4 coord_out = coord;

    coord.z += half_head_size;
    float4 src1 = convert_float4(read_imageui(input, coord));

    src0 = src0 - input_zp;
    src1 = src1 - input_zp;
    cos = cos - cos_zp;
    sin = sin - sin_zp;

    float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
    float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
    uint4 dst0 = convert_uint4_rte(_dst0);
    uint4 dst1 = convert_uint4_rte(_dst1);

    write_imageui(output, coord_out, dst0);
    coord_out.z += half_head_size;
    write_imageui(output, coord_out, dst1);
}

@@ -0,0 +1,307 @@
#include "cl_viv_vx_ext.h"

_viv_uniform int top;
_viv_uniform int left;
_viv_uniform float out_scale_r;
_viv_uniform float out_scale_g;
_viv_uniform float out_scale_b;
_viv_uniform float out_zp_r;
_viv_uniform float out_zp_g;
_viv_uniform float out_zp_b;
_viv_uniform float pad_v_r;
_viv_uniform float pad_v_g;
_viv_uniform float pad_v_b;
_viv_uniform float scale_w;
_viv_uniform float scale_h;
_viv_uniform int resize_max_w;
_viv_uniform int resize_max_h;
_viv_uniform int out_height;
_viv_uniform int r_order;
_viv_uniform int b_order;
_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;
_viv_uniform VXC_512Bits uniLeftToFloat32_4x4;
_viv_uniform VXC_512Bits uniExtactHalf8_2x8;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;

__kernel void custom_letterbox_U8toU8
    (
    __read_only image2d_t input,
    __write_only image2d_t output,
    int top_,
    int bottom_,
    int left_,
    int right_,
    float mean_r_,
    float mean_g_,
    float mean_b_,
    float scale_r_,
    float scale_g_,
    float scale_b_,
    int pad_r_,
    int pad_g_,
    int pad_b_,
    int reverse_channel
    )
{
    int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
    int2 coord = coord_out;
    uint4 dst = (uint4)(0,0,0,0);
    vxc_uchar8 result;

    if (coord_out.x < left || coord_out.x >= resize_max_w ||
        coord_out.y < top || coord_out.y >= resize_max_h)
    {
        dst.x = convert_uint(pad_v_r);
        coord.y = coord_out.y + r_order;
        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));

        dst.x = convert_uint(pad_v_g);
        coord.y = coord_out.y + out_height;
        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));

        dst.x = convert_uint(pad_v_b);
        coord.y = coord_out.y + b_order;
        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
        return;
    }

    float in_x = convert_float(coord_out.x - left) * scale_w;
    float in_y = convert_float(coord_out.y - top) * scale_h;
    float left_x_f = floor(in_x);
    float top_y_f = floor(in_y);
    float x_lerp = in_x - left_x_f;
    float y_lerp = in_y - top_y_f;
    int left_x_idx = convert_int(left_x_f);
    int top_y_idx = convert_int(top_y_f);

    int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
    vxc_uchar8 top_data, bottom_data;

    VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));

    float4 left4 = (float4)(0,0,0,0);
    float4 right4 = (float4)(0,0,0,0);
    float4 top4 = (float4)(0,0,0,0);
    float4 bottom4 = (float4)(0,0,0,0);
    VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
    VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
    top4 = right4 * x_lerp + left4;
    VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
    VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
    bottom4 = right4 * x_lerp + left4;
    float4 out = (bottom4 - top4) * y_lerp + top4;

    dst.x = convert_uint(out.x * out_scale_r + out_zp_r);
    coord.y = coord_out.y + r_order;
    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));

    dst.x = convert_uint(out.y * out_scale_g + out_zp_g);
    coord.y = coord_out.y + out_height;
    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));

    dst.x = convert_uint(out.z * out_scale_b + out_zp_b);
    coord.y = coord_out.y + b_order;
    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}

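Inside the resized region the kernel performs a standard bilinear resample of the interleaved-RGB source (hence the 3 * left_x_idx offset): the two DP4x4 pairs produce a horizontal lerp for the top and bottom rows, then a vertical lerp blends them. Reduced to scalars:

    /* Bilinear blend as performed by the DP4x4 + lerp sequence above. */
    float bilinear(float tl, float tr, float bl, float br,
                   float x_lerp, float y_lerp)
    {
        float top    = (tr - tl) * x_lerp + tl;  /* uniU8RightSubLeft + lerp */
        float bottom = (br - bl) * x_lerp + bl;
        return (bottom - top) * y_lerp + top;
    }
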
__kernel void custom_letterbox_U8toI8
    (
    __read_only image2d_t input,
    __write_only image2d_t output,
    int top_,
    int bottom_,
    int left_,
    int right_,
    float mean_r_,
    float mean_g_,
    float mean_b_,
    float scale_r_,
    float scale_g_,
    float scale_b_,
    int pad_r_,
    int pad_g_,
    int pad_b_,
    int reverse_channel
    )
{
    int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
    int2 coord = coord_out;
    int4 dst = (int4)(0,0,0,0);
    vxc_char8 result;

    if (coord_out.x < left || coord_out.x >= resize_max_w ||
        coord_out.y < top || coord_out.y >= resize_max_h)
    {
        dst.x = convert_int(pad_v_r);
        coord.y = coord_out.y + r_order;
        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));

        dst.x = convert_int(pad_v_g);
        coord.y = coord_out.y + out_height;
        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));

        dst.x = convert_int(pad_v_b);
        coord.y = coord_out.y + b_order;
        VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
        VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
        return;
    }

    float in_x = convert_float(coord_out.x - left) * scale_w;
    float in_y = convert_float(coord_out.y - top) * scale_h;
    float left_x_f = floor(in_x);
    float top_y_f = floor(in_y);
    float x_lerp = in_x - left_x_f;
    float y_lerp = in_y - top_y_f;
    int left_x_idx = convert_int(left_x_f);
    int top_y_idx = convert_int(top_y_f);

    int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
    vxc_char8 top_data, bottom_data;

    VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));

    float4 left4 = (float4)(0,0,0,0);
    float4 right4 = (float4)(0,0,0,0);
    float4 top4 = (float4)(0,0,0,0);
    float4 bottom4 = (float4)(0,0,0,0);
    VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
    VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
    top4 = right4 * x_lerp + left4;
    VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
    VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
    bottom4 = right4 * x_lerp + left4;
    float4 out = (bottom4 - top4) * y_lerp + top4;

    dst.x = convert_int(out.x * out_scale_r + out_zp_r);
    coord.y = coord_out.y + r_order;
    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));

    dst.x = convert_int(out.y * out_scale_g + out_zp_g);
    coord.y = coord_out.y + out_height;
    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));

    dst.x = convert_int(out.z * out_scale_b + out_zp_b);
    coord.y = coord_out.y + b_order;
    VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
    VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}

__kernel void custom_letterbox_U8toF16
    (
    __read_only image2d_t input,
    __write_only image2d_t output,
    int top_,
    int bottom_,
    int left_,
    int right_,
    float mean_r_,
    float mean_g_,
    float mean_b_,
    float scale_r_,
    float scale_g_,
    float scale_b_,
    int pad_r_,
    int pad_g_,
    int pad_b_,
    int reverse_channel
    )
{
    int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
    int2 coord = coord_out;
    half4 tmp;
    vxc_half8 dst_temp;
    vxc_ushort8 dst;

    if (coord_out.x < left || coord_out.x >= resize_max_w ||
        coord_out.y < top || coord_out.y >= resize_max_h)
    {
        float4 pad = (float4)(pad_v_r, pad_v_g, pad_v_b, 0);
        _viv_asm(CONV, tmp, pad);
        VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
        _viv_asm(COPY, dst, dst_temp, 16);
        coord.y = coord_out.y + r_order;
        VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));

        tmp.x = tmp.y;
        VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
        _viv_asm(COPY, dst, dst_temp, 16);
        coord.y = coord_out.y + out_height;
        VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));

        tmp.x = tmp.z;
        VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
        _viv_asm(COPY, dst, dst_temp, 16);
        coord.y = coord_out.y + b_order;
        VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
        return;
    }

    float in_x = convert_float(coord_out.x - left) * scale_w;
    float in_y = convert_float(coord_out.y - top) * scale_h;
    float left_x_f = floor(in_x);
    float top_y_f = floor(in_y);
    float x_lerp = in_x - left_x_f;
    float y_lerp = in_y - top_y_f;
    int left_x_idx = convert_int(left_x_f);
    int top_y_idx = convert_int(top_y_f);

    int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
    vxc_uchar8 top_data, bottom_data;

    VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
        VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));

    float4 left4 = (float4)(0,0,0,0);
    float4 right4 = (float4)(0,0,0,0);
    float4 top4 = (float4)(0,0,0,0);
    float4 bottom4 = (float4)(0,0,0,0);
    VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
    VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
    top4 = right4 * x_lerp + left4;
    VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
    VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
    bottom4 = right4 * x_lerp + left4;
    float4 out = (bottom4 - top4) * y_lerp + top4;

    float4 out_temp = (float4)(0,0,0,0);
    out_temp.x = out.x * out_scale_r + out_zp_r;
    _viv_asm(CONV, tmp, out_temp);
    VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
    _viv_asm(COPY, dst, dst_temp, 16);
    coord.y = coord_out.y + r_order;
    VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));

    out_temp.x = out.y * out_scale_g + out_zp_g;
    _viv_asm(CONV, tmp, out_temp);
    VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
    _viv_asm(COPY, dst, dst_temp, 16);
    coord.y = coord_out.y + out_height;
    VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));

    out_temp.x = out.z * out_scale_b + out_zp_b;
    _viv_asm(CONV, tmp, out_temp);
    VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
    _viv_asm(COPY, dst, dst_temp, 16);
    coord.y = coord_out.y + out_height;
    VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}

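Each pixel is written three times into one tall planar image: every channel occupies an out_height-row band, and r_order / b_order pick the first and last band so reverse_channel can swap R and B without branching per pixel. The exact host values are not in the diff; a plausible setup:

    /* Hypothetical host-side setup of the channel band offsets. */
    void letterbox_band_offsets(int out_height, int reverse_channel,
                                int *r_order, int *b_order)
    {
        /* green always sits in the middle band at +out_height */
        *r_order = reverse_channel ? 2 * out_height : 0;
        *b_order = reverse_channel ? 0 : 2 * out_height;
    }
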
@@ -10,7 +10,12 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32;
_viv_uniform VXC_512Bits uniExtract8Bin_2x8;
_viv_uniform int sf_size;
_viv_uniform float srcScale;
_viv_uniform float srcZP;
_viv_uniform float dstScale;
_viv_uniform float dstZP;
#define F_MAX(a,b) ((a)>(b)?(a):(b))
__kernel void Softmax2VXC
    (

@@ -19,35 +24,37 @@ __kernel void Softmax2VXC
    int axis
    )
{
    int4 coord_in = (int4)(0,0,0,0);
    float fMax = 0.0;
    int4 coord_in = (int4)(0, get_global_id(0), 0, 0);
    float fMax = 0;
    for (int i = 0; i < sf_size; i++)
    {
        vxc_char8 val;
        vxc_short8 val;
        vxc_half8 val_h;
        coord_in.x = i;
        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
        _viv_asm(COPY, val_h, val, 16);
        float fval;
        VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
        VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);

        fMax = F_MAX(fMax, fval);
    }

    float fProbSum = 0.0f;
    vxc_short8 dst;
    for (int i = 0; i < sf_size; i++)
    {
        vxc_char8 val;

        vxc_short8 val;
        vxc_half8 val_h;
        coord_in.x = i;
        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
        _viv_asm(COPY, val_h, val, 16);
        float fval;
        VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);

        VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
        float fOut = (float)exp(fval - fMax);
        fProbSum += fOut;
        half hVal;
        _viv_asm(CONV,hVal,fOut);
        _viv_asm(COPY,dst,hVal, 4);
        _viv_asm(CONV, hVal, fOut);
        _viv_asm(COPY, dst, hVal, 4);

        VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
    }

@@ -56,15 +63,68 @@ __kernel void Softmax2VXC
        vxc_short8 val;
        vxc_half8 val_h;
        coord_in.x = i;
        VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
        VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
        float fval;
        _viv_asm(COPY, val_h,val, 16);
        VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);

        float fOut =fval/fProbSum;
        float fOut =fval / fProbSum;
        half hVal;
        _viv_asm(CONV,hVal,fOut);
        _viv_asm(CONV, hVal, fOut);
        _viv_asm(COPY,dst,hVal, 4);
        VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
    }
}

__kernel void Softmax2VXC_u8
    (
    image2d_array_t input,
    image2d_array_t output,
    int axis
    )
{
    int4 coord_in = (int4)(0, get_global_id(0), 0, 0);
    float fMax = -3.4e38f;
    for (int i = 0; i < sf_size; i++)
    {
        vxc_uchar8 val;
        coord_in.x = i;
        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
        float fval;
        VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
        fval = (fval - srcZP) * srcScale;
        fMax = F_MAX(fMax, fval);
    }

    float fProbSum = 0.0f;
    vxc_uchar8 dst;
    for (int i = 0; i < sf_size; i++)
    {
        vxc_uchar8 val;

        coord_in.x = i;
        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
        float fval;
        VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
        fval = (fval - srcZP) * srcScale;
        float fOut = (float)exp(fval - fMax);
        fProbSum += fOut;
    }

    for (int i = 0; i < sf_size; i++)
    {
        vxc_uchar8 val;
        coord_in.x = i;
        VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
        float fval;
        VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
        fval = (fval - srcZP) * srcScale;

        float fOut = exp(fval - fMax) / fProbSum;

        fOut = fOut * dstScale + dstZP;
        short dst0;
        _viv_asm(CONV, dst0, fOut);
        VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8);
        VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
    }
}

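Softmax2VXC_u8 adds a numerically safe U8 path: pass one finds the dequantized maximum (seeded at -3.4e38 rather than 0, so all-negative inputs still work), pass two accumulates the sum of exp(x - max), pass three normalizes and requantizes. The scalar shape of it:

    /* Scalar reference for the three-pass quantized softmax above. */
    void softmax_u8_ref(const uchar *q, uchar *out, int n,
                        float srcScale, float srcZP, float dstScale, float dstZP)
    {
        float fmax = -3.4e38f;
        for (int i = 0; i < n; i++)
        {
            float x = (q[i] - srcZP) * srcScale;   /* dequantize */
            fmax = x > fmax ? x : fmax;
        }

        float psum = 0.0f;
        for (int i = 0; i < n; i++)
            psum += exp((q[i] - srcZP) * srcScale - fmax);

        for (int i = 0; i < n; i++)
        {
            float p = exp((q[i] - srcZP) * srcScale - fmax) / psum;
            out[i] = convert_uchar_sat_rte(p * dstScale + dstZP);  /* requantize */
        }
    }
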
@@ -16,7 +16,7 @@ _viv_uniform float sum_x2_tail1;
_viv_uniform float output_scale;
_viv_uniform float output_zp;

#define GROUP_NORM_SUMS_16BITS_IMPL(name, src_type) \
#define GROUP_NORM_SUMS_16BITS_IMPL(name, load_type, src_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \
    __read_only image2d_array_t input, \
    __write_only image2d_array_t output, \

@@ -26,7 +26,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
    int lidx = get_local_id(0); \
    int gidz = get_global_id(1); \
    int4 coord = (int4)(gidx, 0, gidz, 0); \
    vxc_short8 src0; \
    load_type src; \
    src_type in_h; \
    float4 sumsqr; \
    float4 tmpSumSqr = (float4)(0); \

@@ -43,9 +43,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
    { \
        for(coord.y = 0; coord.y < height;) \
        { \
            VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
            VXC_OP4(img_load_3d, src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
            coord.y++; \
            _viv_asm(COPY, in_h, src0, 16); \
            _viv_asm(COPY, in_h, src, 16); \
            VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \
            tmpSumSqr += sumsqr; \
        } \

@@ -76,10 +76,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
        write_imagef(output, coord_out, data); \
    } \
}
GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_half8)
GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8)
GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_short8, vxc_half8)
GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8, vxc_short8)
GROUP_NORM_SUMS_16BITS_IMPL(U16, vxc_ushort8, vxc_ushort8)

#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \
#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, load_type, src_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \
    __read_only image2d_array_t input, \
    __write_only image2d_array_t output, \

@@ -89,7 +90,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
    int lidx = get_local_id(0); \
 \
    int2 coord = (int2)(gidx, get_global_id(1)); \
    vxc_short8 src0; \
    load_type src; \
    src_type in_h; \
    float4 sumsqr = (float4)(0); \
 \

@@ -98,8 +99,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
 \
    if(gidx < width) \
    { \
        VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
        _viv_asm(COPY, in_h, src0, 16); \
        VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
        _viv_asm(COPY, in_h, src, 16); \
        VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \
        sumsqr.y = sumsqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sumsqr.x; \
        sumsqr.x = sumsqr.x * input_scale + sum_x_tail; \

@@ -128,8 +129,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
        write_imagef(output, coord_out, data); \
    } \
}
GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8)
GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8)
GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_short8, vxc_half8)
GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8, vxc_short8)
GROUP_NORM_SUMS_16BITS_IMPL_2D(U16, vxc_ushort8, vxc_ushort8)

#define GROUP_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \
|
||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \
|
||||
|
|
@ -178,7 +180,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
|
|||
_viv_asm(CONV_RTE, tmpVal0, norm); \
|
||||
norm = alpha * tmpData1 + bias_val; \
|
||||
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, outval, dst, 16); \
|
||||
VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
|
||||
}
|
||||
|
|
@ -230,10 +232,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
|
|||
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
|
||||
float4 norm; \
|
||||
norm = alpha * tmpData0 + bias_val; \
|
||||
_viv_asm(CONV, tmpVal0, norm); \
|
||||
_viv_asm(CONV_RTE, tmpVal0, norm); \
|
||||
norm = alpha * tmpData1 + bias_val; \
|
||||
_viv_asm(CONV, tmpVal1, norm); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
|
||||
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, outval, dst, 16); \
|
||||
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
}
|
||||
|
|
@ -283,7 +285,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
|
|||
\
|
||||
float4 norm; \
|
||||
norm = alpha * tmpData0 + bias_val; \
|
||||
_viv_asm(CONV, tmpVal0, norm); \
|
||||
_viv_asm(CONV_RTE, tmpVal0, norm); \
|
||||
norm = alpha * tmpData1 + bias_val; \
|
||||
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||
|
|
@ -296,6 +298,7 @@ GROUP_NORM_16BITS_F32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int
|
|||
GROUP_NORM_16BITS_F32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)
|
||||
GROUP_NORM_16BITS_F32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)
|
||||
GROUP_NORM_16BITS_F32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)
|
||||
GROUP_NORM_16BITS_F32_IMPL(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4)
|
||||
|
||||
#define GROUP_NORM_16BITS_F32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \
|
||||
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \
|
||||
|
|
@ -333,10 +336,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
|
|||
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
|
||||
float4 norm; \
|
||||
norm = alpha * tmpData0 + bias_val; \
|
||||
_viv_asm(CONV, tmpVal0, norm); \
|
||||
_viv_asm(CONV_RTE, tmpVal0, norm); \
|
||||
norm = alpha * tmpData1 + bias_val; \
|
||||
_viv_asm(CONV, tmpVal1, norm); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
|
||||
_viv_asm(CONV_RTE, tmpVal1, norm); \
|
||||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||
_viv_asm(COPY, outval, dst, 16); \
|
||||
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
}
|
||||
|
|
@ -346,4 +349,5 @@ GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8,
|
|||
GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)
|
||||
GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)
|
||||
GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)
|
||||
GROUP_NORM_16BITS_F32_IMPL_2D(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4)
|
||||
|
||||
|
|
|
|||
|
|
@ -115,45 +115,45 @@ _viv_uniform VXC_512Bits uniDataSubZPtoFp32Part1_4x4;
|
|||
_viv_uniform VXC_512Bits uniConvF16toF32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniConvF16toF32_part1_4x4;
|
||||
_viv_uniform VXC_512Bits uniExtact8Bin_2x8;
|
||||
_viv_uniform int inputZP0;
|
||||
_viv_uniform int inputZP1;
|
||||
_viv_uniform float input_scale0;
|
||||
_viv_uniform float input_scale1;
|
||||
_viv_uniform float outputZP;
|
||||
#define PRELU_F16_3D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \
|
||||
__kernel void prelu_##name0##to##name1( \
|
||||
_viv_uniform int input0_zp;
|
||||
_viv_uniform int input1_zp;
|
||||
_viv_uniform float input0_scale;
|
||||
_viv_uniform float input1_scale;
|
||||
_viv_uniform float output_zp;
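// PReLU: y = x for x >= 0, y = slope * x otherwise; computed below as
// max(x, 0) + slope * min(x, 0) on dequantized floats, then requantized
// with output_zp.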
#define PRELU_F16_3D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \
__kernel void prelu_##name( \
__read_only image2d_array_t input0, \
__read_only image2d_array_t input1, \
__write_only image2d_array_t output) \
{\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\
vxc_float4 vecA, vecB, vecC, vecD;\
float4 vecA, vecB, vecC, vecD;\
input_type0 srcA;\
copy_type0 src0;\
vxc_short8 srcB;\
vxc_half8 src1;\
input_type0 input_ZP;\
input_type0 zp;\
VXC_ReadImage2DArray(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
_viv_asm(COPY, src0, srcA, 16); \
VXC_ReadImage2DArray(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
_viv_asm(COPY, src1, srcB, 16); \
\
_viv_asm(COPY, input_ZP, inputZP0, 4);\
VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
_viv_asm(COPY, zp, input0_zp, 4);\
VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
uniDataSubZPtoFp32Part0_4x4); \
VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
uniDataSubZPtoFp32Part1_4x4);\
VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\
VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\
\
vecA = vecA * input_scale0;\
vecB = vecB * input_scale0;\
vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \
vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \
vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \
vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + outputZP;\
vecB = maxData1 + vecD * minData1 + outputZP;\
vecA = vecA * input0_scale;\
vecB = vecB * input0_scale;\
float4 maxData0 = vecA > 0 ? vecA : 0.0; \
float4 maxData1 = vecB > 0 ? vecB : 0.0; \
float4 minData0 = vecA < 0 ? vecA : 0.0; \
float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + output_zp;\
vecB = maxData1 + vecD * minData1 + output_zp;\
convert_type dst0, dst1;\
_viv_asm(CONV_RTE, dst0, vecA);\
_viv_asm(CONV_RTE, dst1, vecB);\

@@ -164,49 +164,49 @@ _viv_uniform float outputZP;
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
}
// name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type
PRELU_F16_3D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
PRELU_F16_3D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
PRELU_F16_3D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_3D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
PRELU_F16_3D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_3D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
PRELU_F16_3D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
PRELU_F16_3D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_3D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
PRELU_F16_3D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
PRELU_F16_3D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)

#define PRELU_F16_2D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \
__kernel void prelu_##name0##to##name1##_2D( \
#define PRELU_F16_2D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \
__kernel void prelu_##name##_2D( \
__read_only image2d_array_t input0, \
__read_only image2d_array_t input1, \
__write_only image2d_array_t output) \
{\
int2 coord = (int2)(get_global_id(0), get_global_id(1));\
vxc_float4 vecA, vecB, vecC, vecD;\
float4 vecA, vecB, vecC, vecD;\
input_type0 srcA;\
copy_type0 src0;\
vxc_short8 srcB;\
vxc_half8 src1;\
input_type0 input_ZP;\
input_type0 zp;\
VXC_ReadImage(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
_viv_asm(COPY, src0, srcA, 16); \
VXC_ReadImage(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
_viv_asm(COPY, src1, srcB, 16); \
\
_viv_asm(COPY, input_ZP, inputZP0, 4);\
VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
_viv_asm(COPY, zp, input0_zp, 4);\
VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\
VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\
\
vecA = vecA * input_scale0;\
vecB = vecB * input_scale0;\
vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \
vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \
vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \
vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + outputZP;\
vecB = maxData1 + vecD * minData1 + outputZP;\
vecA = vecA * input0_scale;\
vecB = vecB * input0_scale;\
float4 maxData0 = vecA > 0 ? vecA : 0.0; \
float4 maxData1 = vecB > 0 ? vecB : 0.0; \
float4 minData0 = vecA < 0 ? vecA : 0.0; \
float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + output_zp;\
vecB = maxData1 + vecD * minData1 + output_zp;\
convert_type dst0, dst1;\
_viv_asm(CONV_RTE, dst0, vecA);\
_viv_asm(CONV_RTE, dst1, vecB);\

@@ -216,49 +216,49 @@ PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha
_viv_asm(COPY, dst, dst2, 16); \
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
}
PRELU_F16_2D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
PRELU_F16_2D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_2D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
PRELU_F16_2D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
PRELU_F16_2D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_2D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
PRELU_F16_2D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_2D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
PRELU_F16_2D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
PRELU_F16_2D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
PRELU_F16_2D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)

#define PRELU_U8_2D(name, output_type, convert_type, copy_type) \
__kernel void prelu_U8U8to##name##_2D( \
#define PRELU_INTEGER_2D(name, src0_type, src1_type, output_type, convert_type, copy_type) \
__kernel void prelu_##name##_2D( \
__read_only image2d_array_t input0, \
__read_only image2d_array_t input1, \
__write_only image2d_array_t output) \
{\
int2 coord = (int2)(get_global_id(0), get_global_id(1));\
vxc_float4 vecA, vecB, vecC, vecD;\
vxc_uchar16 src0;\
vxc_uchar16 src1;\
vxc_uchar16 input_ZP0;\
vxc_uchar16 input_ZP1;\
float4 vecA, vecB, vecC, vecD;\
src0_type src0;\
src1_type src1;\
short zp0;\
short zp1;\
VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
\
_viv_asm(COPY, input_ZP0, inputZP0, 4);\
VXC_DP4x4(vecA, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecB, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
_viv_asm(COPY, input_ZP1, inputZP1, 4);\
VXC_DP4x4(vecC, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecD, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
_viv_asm(COPY, zp0, input0_zp, 2);\
VXC_DP4x4(vecA, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecB, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
_viv_asm(COPY, zp1, input1_zp, 4);\
VXC_DP4x4(vecC, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecD, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
\
vecA = vecA * input_scale0;\
vecB = vecB * input_scale0;\
vecC = vecC * input_scale1;\
vecD = vecD * input_scale1;\
vxc_float4 maxData0 = vecA >= 0 ? vecA : 0.0; \
vxc_float4 maxData1 = vecB >= 0 ? vecB : 0.0; \
vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \
vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + outputZP;\
vecB = maxData1 + vecD * minData1 + outputZP;\
vecA = vecA * input0_scale;\
vecB = vecB * input0_scale;\
vecC = vecC * input1_scale;\
vecD = vecD * input1_scale;\
float4 maxData0 = vecA >= 0 ? vecA : 0.0; \
float4 maxData1 = vecB >= 0 ? vecB : 0.0; \
float4 minData0 = vecA < 0 ? vecA : 0.0; \
float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + output_zp;\
vecB = maxData1 + vecD * minData1 + output_zp;\
convert_type dst0, dst1;\
_viv_asm(CONV_RTE, dst0, vecA);\
_viv_asm(CONV_RTE, dst1, vecB);\

@@ -268,7 +268,8 @@ PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha
_viv_asm(COPY, dst, dst2, 16); \
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
}
PRELU_U8_2D(U8, vxc_uchar16, int4, vxc_uchar16)
PRELU_U8_2D(F16, vxc_half8, half4, vxc_short8)
PRELU_INTEGER_2D(U8U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_INTEGER_2D(U8U8toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)

@@ -0,0 +1,181 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp
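// uniU8PostProcess_2x8 rescales the fixed-point bilinear accumulator by the
// packed multiplier and adds the output zero point to produce the final U8.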
_viv_uniform VXC_512Bits uniResize2xUp_0_4x8;
_viv_uniform VXC_512Bits uniResize2xUp_1_4x8;
_viv_uniform int out_height;

__kernel void resize_bilinear_U8toU8_2x_upsample_half_pixel_centers
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
coord_in.x = (coord_out.x * 2 - 1) >> 2;
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
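// Half-pixel-centers mapping for 2x upsample:
// src_x = (dst_x + 0.5) / 2 - 0.5 = (2 * dst_x - 1) / 4, floored by the
// arithmetic >> 2; dst_x == 0 is clamped to -1 so the border sample is reused.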

vxc_uchar16 in0, in1, tmp, result;

int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));

int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);

vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);

vxc_ushort8 dst0;
while (coord_out.y < out_height)
{
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.y += 2;
coord_out.y++;
}
}

_viv_uniform VXC_512Bits uniResize4xUp_l00_4x8;
_viv_uniform VXC_512Bits uniResize4xUp_l01_4x8;
_viv_uniform VXC_512Bits uniResize4xUp_l10_4x8;
_viv_uniform VXC_512Bits uniResize4xUp_l11_4x8;
__kernel void resize_bilinear_U8toU8_4x_upsample_half_pixel_centers
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
coord_in.x = (coord_out.x * 2 - 3) >> 3;
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
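// 4x mapping: src_x = (dst_x + 0.5) / 4 - 0.5 = (2 * dst_x - 3) / 8,
// floored by >> 3; the dst_x == 0 edge is clamped to -1.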

vxc_uchar16 in0, in1, dst0, dst1;

int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));

int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);

vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);

vxc_ushort8 tmp;
while (coord_out.y < out_height)
{
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.y += 2;
coord_out.y++;
}
}

@@ -0,0 +1,102 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp
_viv_uniform VXC_512Bits uniResize3xUp_l00_2x8;
_viv_uniform VXC_512Bits uniResize3xUp_l01_2x8;
_viv_uniform VXC_512Bits uniResize3xUp_l10_4x4;
_viv_uniform VXC_512Bits uniResize3xUp_l11_4x4;
_viv_uniform VXC_512Bits uniResize3xUp_l12_4x4;
_viv_uniform VXC_512Bits uniResize3xUp_l13_4x4;
__kernel void resize_bilinear_U8toU8_3x_upsample_half_pixel_centers
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
coord_in.x = (short)(coord_out.x * 2 - 1) / (short)6;
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6;
coord_in.y = coord_out.y == 0 ? -1 : coord_in.y;
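// 3x mapping: src = (dst + 0.5) / 3 - 0.5. The truncating short division
// (2 * dst - 1) / 6 picks the same tap for dst >= 1; dst == 0 (which needs
// floor toward -1) is clamped explicitly.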

vxc_uchar16 in0, in1, in2, in3, tmp, dst0, dst1, dst2;

int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));

int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);

vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);

vxc_ushort8 data;

VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;

VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);
VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);
VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;

VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);
VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);
VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
}

@@ -0,0 +1,167 @@
#include "cl_viv_vx_ext.h"

_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp
_viv_uniform int out_height;
_viv_uniform VXC_512Bits uniResize8xUp_l00_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l01_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l10_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l11_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l20_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l21_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l30_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l31_4x8;
__kernel void resize_bilinear_U8toU8_8x_upsample_half_pixel_centers
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
coord_in.x = (coord_out.x * 2 - 7) >> 4;
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
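// 8x mapping: src_x = (dst_x + 0.5) / 8 - 0.5 = (2 * dst_x - 7) / 16,
// floored by >> 4; the dst_x == 0 edge is clamped to -1.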

vxc_uchar16 in0, in1, in2, dst0, dst1, dst2, dst3;

int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));

int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);

vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);

vxc_ushort8 tmp;
while (coord_out.y < out_height)
{
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.y += 2;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
}
}

@@ -0,0 +1,303 @@
#include "cl_viv_vx_ext.h"

_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform int half_head_size;
_viv_uniform VXC_512Bits uniATimesB_0_4x4;
_viv_uniform VXC_512Bits uniATimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
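// Rotary embedding over (y, y + half_head_size) feature pairs:
//   out_lo = x_lo * cos - x_hi * sin
//   out_hi = x_lo * sin + x_hi * cos
// scale0/scale1 fold the quantization scales of the two products and
// output_zp re-centers the quantized result.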

#define ROPE_BNHS_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bnhs \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
int4 coord_out = coord_in; \
\
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
\
src_type data0, data1; \
src1_type cos, sin; \
copy_type v0, v1; \
dst_type dst; \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord_in.y += half_head_size; \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 data2, data3, data4, data5; \
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale0 - data5 * scale1 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
data2 = data2 * scale1 + data4 * scale0 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.y += half_head_size; \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_BNHS_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BNHS_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BNHS_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BNHS_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BNHS_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BNHS_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)

__kernel void rope_F16_F16toF16_bnhs
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord_in;

int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);

vxc_short8 v0, v1, v2, v3, dst;
vxc_half8 data0, data1, cos, sin, dst2;
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data0, v0, 16);
VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, cos, v1, 16);
VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, sin, v2, 16);
coord_in.y += half_head_size;
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data1, v3, 16);

int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);

float4 data2, data3, data4, data5;
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
data2 = data2 - data4;
data3 = data3 - data5;

half4 dst0;
half4 dst1;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);

VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));

VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
data2 = data2 * scale1 + data4 * scale0 + output_zp;
data3 = data3 * scale1 + data5 * scale0 + output_zp;

_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);

VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
coord_out.y += half_head_size;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}

_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
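// Asymmetric-quantization variant: input, cos and sin are first dequantized
// as (q - zp) via the uniAMinusZp dot products, rotated as above, and the
// per-tensor scales are applied through scale0/scale1.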
|
||||
#define ROPE_ASYM_BNHS(name, src1_type, copy_type, dtype) \
|
||||
__kernel void rope_##name##_bnhs \
|
||||
( \
|
||||
__read_only image2d_array_t input, \
|
||||
__read_only image2d_array_t cos_cache, \
|
||||
__read_only image2d_array_t sin_cache, \
|
||||
__write_only image2d_array_t output, \
|
||||
int axis \
|
||||
) \
|
||||
{ \
|
||||
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
|
||||
int4 coord_out = coord_in; \
|
||||
\
|
||||
int8 input_desc; \
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
|
||||
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
|
||||
_viv_asm(MOV, coord_in.w, baseAddr); \
|
||||
\
|
||||
dtype data0, data1, dst; \
|
||||
src1_type cos, sin; \
|
||||
copy_type v0, v1; \
|
||||
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, cos, v0, 16); \
|
||||
VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, sin, v1, 16); \
|
||||
coord_in.y += half_head_size; \
|
||||
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
int8 output_desc; \
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
|
||||
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
|
||||
_viv_asm(MOV, coord_out.w, baseAddr); \
|
||||
\
|
||||
float4 l00, l01, cos0, cos1; \
|
||||
float4 l10, l11, sin0, sin1; \
|
||||
VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
|
||||
VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
|
||||
VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
|
||||
VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
|
||||
VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
|
||||
VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
|
||||
VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
|
||||
VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
|
||||
float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \
|
||||
float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
|
||||
\
|
||||
int4 dst0 = convert_int4_rte(data2); \
|
||||
int4 dst1 = convert_int4_rte(data3); \
|
||||
\
|
||||
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, \
|
||||
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
|
||||
\
|
||||
data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \
|
||||
data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
|
||||
\
|
||||
dst0 = convert_int4_rte(data2); \
|
||||
dst1 = convert_int4_rte(data3); \
|
||||
\
|
||||
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
|
||||
coord_out.y += half_head_size; \
|
||||
VXC_OP4_NoDest(img_store_3d, output, \
|
||||
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
|
||||
}
|
||||
ROPE_ASYM_BNHS(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
|
||||
ROPE_ASYM_BNHS(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
|
||||
ROPE_ASYM_BNHS(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
|
||||
ROPE_ASYM_BNHS(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
|
||||
ROPE_ASYM_BNHS(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
|
||||
ROPE_ASYM_BNHS(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)
|
||||
|
||||
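/* BF16 path: each bfloat16 lane is widened to float32 by pairing it with
 * zero bits (uniConvBF16toF32_Part0/1_2x8), the rotation runs in float, and
 * uniExtractOddData_2x8 takes the high 16 bits of every float back out as
 * bfloat16. */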
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
__kernel void rope_BF16_BF16toBF16_bnhs
    (
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
    )
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
    int4 coord_out = coord_in;

    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);

    vxc_ushort8 v0, v1, v2, v3, dst;
    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    coord_in.y += half_head_size;
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
    vxc_short8 data;
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src0, data, 16);
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src1, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, cos0, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, cos1, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, sin0, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, sin1, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src2, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src3, data, 16);

    float4 data0 = src0 * cos0 - src2 * sin0;
    float4 data1 = src1 * cos1 - src3 * sin1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    data0 = src0 * sin0 + src2 * cos0;
    data1 = src1 * sin1 + src3 * cos1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    coord_out.y += half_head_size;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

@@ -0,0 +1,245 @@
#include "cl_viv_vx_ext.h"

_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform int half_head_size;
_viv_uniform VXC_512Bits uniATimesB_0_4x4;
_viv_uniform VXC_512Bits uniATimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;

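/* rope_*_bnh1 kernels: the rotated pair is (x0, x1) = (input[i],
 * input[i + half_head_size]) and the outputs are
 *     out0 = x0 * cos - x1 * sin
 *     out1 = x0 * sin + x1 * cos
 * with the scales and output zero point folded in for the quantized types. */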
#define ROPE_BNH1_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bnh1 \
    ( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
    ) \
{ \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
    src_type data0, data1; \
    src1_type cos, sin; \
    copy_type v0, v1; \
    VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
    coord.x += half_head_size; \
    VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
    float4 data2, data3, data4, data5; \
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
    VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
    VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
    data3 = data3 * scale0 - data5 * scale1 + output_zp; \
\
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
\
    dst_type dst; \
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
    VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
    VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
    data2 = data2 * scale1 + data4 * scale0 + output_zp; \
    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
\
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_BNH1_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BNH1_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BNH1_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BNH1_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BNH1_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BNH1_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)

__kernel void rope_F16_F16toF16_bnh1
    (
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));

    vxc_short8 v0, v1, v2, v3, dst;
    vxc_half8 data0, data1, cos, sin, dst2;
    VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data0, v0, 16);
    VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, cos, v1, 16);
    VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, sin, v2, 16);
    coord.x += half_head_size;
    VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data1, v3, 16);

    float4 data2, data3, data4, data5;
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    data2 = data2 - data4;
    data3 = data3 - data5;

    half4 dst0;
    half4 dst1;
    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
    data2 = data2 + data4;
    data3 = data3 + data5;

    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
#define ROPE_ASYM_BNH1(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnh1 \
    ( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
    ) \
{ \
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
    dtype data0, data1, dst; \
    src1_type cos, sin; \
    copy_type v0, v1; \
    VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
    coord.x += half_head_size; \
    VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
    float4 l00, l01, cos0, cos1; \
    float4 l10, l11, sin0, sin1; \
    VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \
    float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
\
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
\
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
    data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \
    data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
\
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
\
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNH1(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNH1(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNH1(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNH1(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNH1(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNH1(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
__kernel void rope_BF16_BF16toBF16_bnh1
    (
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));

    vxc_ushort8 v0, v1, v2, v3, dst;
    VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    coord.x += half_head_size;
    VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
    vxc_short8 data;
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src0, data, 16);
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src1, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, cos0, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, cos1, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, sin0, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, sin1, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src2, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src3, data, 16);

    float4 data0 = src0 * cos0 - src2 * sin0;
    float4 data1 = src1 * cos1 - src3 * sin1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    data0 = src0 * sin0 + src2 * cos0;
    data1 = src1 * sin1 + src3 * cos1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

@@ -0,0 +1,312 @@
#include "cl_viv_vx_ext.h"

_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;

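/* rope_*_bsnh kernels: unlike the half-split layouts above, the rotated pair
 * here is the adjacent even/odd lanes of one row, selected by the
 * uniAEvenTimesB_* / uniAOddTimesB_* (and, for the asymmetric variants,
 * uniAEvenMinusZp_4x4 / uniAOddMinusZp_4x4) dot products; cos/sin are
 * fetched with coord_in.xw. */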
#define ROPE_BSNH_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bsnh \
    ( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
\
    src_type data0, data1; \
    src1_type cos, sin; \
    copy_type v0, v1; \
    dst_type dst; \
    VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
\
    coord_in.x *= 2; \
    int8 input_desc; \
    _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
    _viv_asm(MOV, coord_in.w, baseAddr); \
    VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
    int4 coord_out = coord_in; \
    int8 output_desc; \
    _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
    _viv_asm(MOV, coord_out.w, baseAddr); \
\
    float4 data2, data3, data4, data5; \
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
    VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
    VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
\
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
        dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
    VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
    VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
\
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    coord_out.x += 8; \
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
        dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_BSNH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BSNH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BSNH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BSNH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BSNH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BSNH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)

__kernel void rope_F16_F16toF16_bsnh
    (
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
    )
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));

    vxc_short8 v0, v1, v2, v3, dst;
    vxc_half8 data0, data1, cos, sin, dst2;
    VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, cos, v1, 16);
    VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, sin, v2, 16);

    coord_in.x *= 2;
    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);

    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data0, v0, 16);
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data1, v3, 16);

    int4 coord_out = coord_in;
    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 data2, data3, data4, data5;
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
    VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
    VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
    data2 = data2 - data4;
    data3 = data3 + data5;

    half4 dst0;
    half4 dst1;
    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
    VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
    data2 = data2 - data4;
    data3 = data3 + data5;

    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    coord_out.x += 8;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4;
_viv_uniform VXC_512Bits uniAOddMinusZp_4x4;
#define ROPE_ASYM_BSNH(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bsnh \
    ( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
\
    dtype data0, data1, dst; \
    src1_type cos, sin; \
    copy_type v0, v1; \
\
    VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
    coord_in.x *= 2; \
    int8 input_desc; \
    _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
    _viv_asm(MOV, coord_in.w, baseAddr); \
    VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
    int4 coord_out = coord_in; \
    int8 output_desc; \
    _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
    _viv_asm(MOV, coord_out.w, baseAddr); \
\
    float4 l00, l01, cos0, cos1; \
    float4 l10, l11, sin0, sin1; \
    VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
    VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
    VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \
    float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \
\
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
\
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_OP4_NoDest(img_store_3d, output, \
        coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
    VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
    VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
    data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
    data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
\
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
\
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    coord_out.x += 8; \
    VXC_OP4_NoDest(img_store_3d, output, \
        coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BSNH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BSNH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BSNH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BSNH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BSNH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BSNH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
__kernel void rope_BF16_BF16toBF16_bsnh
    (
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
    )
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));

    vxc_ushort8 v0, v1, v2, v3, dst;
    VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    coord_in.x *= 2;
    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);
    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    int4 coord_out = coord_in;
    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
    vxc_short8 data;
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src0, data, 16);
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src1, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, cos0, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, cos1, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, sin0, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, sin1, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src2, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src3, data, 16);

    float4 even = (float4)(src0.xz, src1.xz);
    float4 odd = (float4)(src0.yw, src1.yw);
    float4 data0 = even * cos0 - odd * sin0;
    float4 data1 = even * sin0 + odd * cos0;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    even = (float4)(src2.xz, src3.xz);
    odd = (float4)(src2.yw, src3.yw);
    data0 = even * cos1 - odd * sin1;
    data1 = even * sin1 + odd * cos1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    coord_out.x += 8;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

@@ -0,0 +1,312 @@
#include "cl_viv_vx_ext.h"

_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;

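/* rope_*_bnsh kernels: same even/odd interleaved rotation as the bsnh
 * variants; the only difference visible here is that the cos/sin caches are
 * indexed with coord_in.xy instead of coord_in.xw. */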
#define ROPE_BNSH_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bnsh \
    ( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
\
    src_type data0, data1; \
    src1_type cos, sin; \
    copy_type v0, v1; \
    dst_type dst; \
    VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
\
    coord_in.x *= 2; \
    int8 input_desc; \
    _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
    _viv_asm(MOV, coord_in.w, baseAddr); \
    VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
    int4 coord_out = coord_in; \
    int8 output_desc; \
    _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
    _viv_asm(MOV, coord_out.w, baseAddr); \
\
    float4 data2, data3, data4, data5; \
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
    VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
    VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
\
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
        dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
    VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
    VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
    data2 = data2 * scale0 - data4 * scale1 + output_zp; \
    data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
\
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    coord_out.x += 8; \
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
        dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_BNSH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BNSH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BNSH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BNSH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BNSH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BNSH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)

__kernel void rope_F16_F16toF16_bnsh
    (
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
    )
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));

    vxc_short8 v0, v1, v2, v3, dst;
    vxc_half8 data0, data1, cos, sin, dst2;
    VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, cos, v1, 16);
    VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, sin, v2, 16);

    coord_in.x *= 2;
    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);

    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data0, v0, 16);
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    _viv_asm(COPY, data1, v3, 16);

    int4 coord_out = coord_in;
    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 data2, data3, data4, data5;
    VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
    VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
    VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
    VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
    data2 = data2 - data4;
    data3 = data3 + data5;

    half4 dst0;
    half4 dst1;
    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
    VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
    VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
    VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
    data2 = data2 - data4;
    data3 = data3 + data5;

    _viv_asm(CONV_RTE, dst0, data2);
    _viv_asm(CONV_RTE, dst1, data3);

    VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
    _viv_asm(COPY, dst, dst2, 16);
    coord_out.x += 8;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4;
_viv_uniform VXC_512Bits uniAOddMinusZp_4x4;
#define ROPE_ASYM_BNSH(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnsh \
    ( \
    __read_only image2d_array_t input, \
    __read_only image2d_array_t cos_cache, \
    __read_only image2d_array_t sin_cache, \
    __write_only image2d_array_t output, \
    int axis \
    ) \
{ \
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
\
    dtype data0, data1, dst; \
    src1_type cos, sin; \
    copy_type v0, v1; \
\
    VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, cos, v0, 16); \
    VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    _viv_asm(COPY, sin, v1, 16); \
    coord_in.x *= 2; \
    int8 input_desc; \
    _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
    _viv_asm(MOV, coord_in.w, baseAddr); \
    VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
    VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
    int4 coord_out = coord_in; \
    int8 output_desc; \
    _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
    _viv_asm(MOV, coord_out.w, baseAddr); \
\
    float4 l00, l01, cos0, cos1; \
    float4 l10, l11, sin0, sin1; \
    VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
    VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
    VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
    VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
    float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \
    float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \
\
    int4 dst0 = convert_int4_rte(data2); \
    int4 dst1 = convert_int4_rte(data3); \
\
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    VXC_OP4_NoDest(img_store_3d, output, \
        coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
    VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
    VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
    data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
    data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
\
    dst0 = convert_int4_rte(data2); \
    dst1 = convert_int4_rte(data3); \
\
    VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
    coord_out.x += 8; \
    VXC_OP4_NoDest(img_store_3d, output, \
        coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNSH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNSH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNSH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNSH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNSH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNSH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)

_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
__kernel void rope_BF16_BF16toBF16_bnsh
    (
    __read_only image2d_array_t input,
    __read_only image2d_array_t cos_cache,
    __read_only image2d_array_t sin_cache,
    __write_only image2d_array_t output,
    int axis
    )
{
    int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));

    vxc_ushort8 v0, v1, v2, v3, dst;
    VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    coord_in.x *= 2;
    int8 input_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
    _viv_asm(MOV, coord_in.w, baseAddr);
    VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
    VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
        VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    int4 coord_out = coord_in;
    int8 output_desc;
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
    _viv_asm(MOV, coord_out.w, baseAddr);

    float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
    vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
    vxc_short8 data;
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src0, data, 16);
    VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src1, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, cos0, data, 16);
    VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, cos1, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, sin0, data, 16);
    VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, sin1, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
    _viv_asm(COPY, src2, data, 16);
    VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
    _viv_asm(COPY, src3, data, 16);

    float4 even = (float4)(src0.xz, src1.xz);
    float4 odd = (float4)(src0.yw, src1.yw);
    float4 data0 = even * cos0 - odd * sin0;
    float4 data1 = even * sin0 + odd * cos0;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

    even = (float4)(src2.xz, src3.xz);
    odd = (float4)(src2.yw, src3.yw);
    data0 = even * cos1 - odd * sin1;
    data1 = even * sin1 + odd * cos1;

    _viv_asm(COPY, v0, data0, 16);
    _viv_asm(COPY, v1, data1, 16);

    VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
    coord_out.x += 8;
    VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

@@ -93,3 +93,101 @@ __kernel void scatter_nd_update_cpy2out_##src0_type##to##src0_type( \
}
SCATTER_ND_UPDATE_COPY2OUT(U8, vxc_uchar16, 1)
SCATTER_ND_UPDATE_COPY2OUT(I8, vxc_char16, 1)
SCATTER_ND_UPDATE_COPY2OUT(U16, vxc_ushort8, 2)
SCATTER_ND_UPDATE_COPY2OUT(I16, vxc_short8, 2)

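/* ref2out pass: copies the reference tensor into the intermediate buffer,
 * requantizing each 16-bit element with the multiplier/zero-point packed in
 * multAndoutZP0 and uniU8MulAndPostShift0_Lo_2x8. */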
#define SCATTER_ND_UPDATE_REF2OUT_16BITS(src0_type, data_type) \
__kernel void scatter_nd_update_ref2out_##src0_type##to##src0_type( \
    __read_only image2d_t input_ref, \
    image2d_t temp_ref, \
    image2d_t output0 \
    ) \
{ \
    int gidx = get_global_id(0); \
    Image img0 = create_image_from_image2d(input_ref, 2); \
    Image img1 = create_image_from_image2d(temp_ref, 2); \
    __global data_type* in_ptr = (__global data_type*)img0.ptr; \
    __global data_type* out_ptr = (__global data_type*)img1.ptr; \
    data_type src, dst; \
    src = in_ptr[gidx]; \
    vxc_ushort8 mp0; \
    _viv_asm(COPY, mp0, multAndoutZP0, 16); \
    VXC_DP2x8(dst, src, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
        uniU8MulAndPostShift0_Lo_2x8); \
    out_ptr[gidx] = dst; \
}
SCATTER_ND_UPDATE_REF2OUT_16BITS(U16, vxc_ushort8)
SCATTER_ND_UPDATE_REF2OUT_16BITS(I16, vxc_short8)

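/* update2ref pass: scatters each requantized update row to the flat offset
 * decoded from the int32 coordinates. offset_idx, offsetX..offsetW,
 * update_width and output_width are not declared in this hunk; presumably
 * they are _viv_uniforms filled in by the host-side kernel setup. */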
#define SCATTER_ND_UPDATE_UPDATE2REF_16BITS(src0_type, data_type) \
__kernel void scatter_nd_update_update2ref_##src0_type##to##src0_type##_16x( \
    __read_only image2d_t input_index, \
    __read_only image2d_t input_update, \
    image2d_t temp_ref, \
    image2d_t input0, \
    image2d_t output1, \
    int width, int area, int vol, int coord_dim \
    ) \
{ \
    int gidx = get_global_id(0); \
    int gidy = get_global_id(1); \
\
    Image img1 = create_image_from_image2d(input_index, 4); \
    Image img2 = create_image_from_image2d(input_update, 2); \
    Image img3 = create_image_from_image2d(temp_ref, 2); \
    __global int* index_ptr = (__global int*)img1.ptr; \
    __global data_type* update_ptr = (__global data_type*)img2.ptr; \
    __global data_type* output_ptr = (__global data_type*)img3.ptr; \
    data_type dst; \
\
    int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx); \
    data_type src = update_ptr[gidy * update_width + gidx]; \
    int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \
    int loc = idx * output_width + gidx; \
    vxc_ushort8 mp1; \
    _viv_asm(COPY, mp1, multAndoutZP1, 16); \
    VXC_DP2x8(dst, src, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
        uniU8MulAndPostShift1_Lo_2x8); \
    output_ptr[loc] = dst; \
}
SCATTER_ND_UPDATE_UPDATE2REF_16BITS(U16, vxc_ushort8)
SCATTER_ND_UPDATE_UPDATE2REF_16BITS(I16, vxc_short8)

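/* F16 variants: no requantization is needed, so both passes move the raw
 * 16-bit payloads straight through. */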
__kernel void scatter_nd_update_ref2out_F16toF16(
    __read_only image2d_t input_ref,
    image2d_t temp_ref,
    image2d_t output0
    )
{
    int gidx = get_global_id(0);
    Image img0 = create_image_from_image2d(input_ref, 2);
    Image img1 = create_image_from_image2d(temp_ref, 2);
    __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)img0.ptr;
    __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)img1.ptr;
    out_ptr[gidx] = in_ptr[gidx];
}

__kernel void scatter_nd_update_update2ref_F16toF16_16x(
    __read_only image2d_t input_index,
    __read_only image2d_t input_update,
    image2d_t temp_ref,
    image2d_t input0,
    image2d_t output1,
    int width, int area, int vol, int coord_dim
    )
{
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);

    Image img1 = create_image_from_image2d(input_index, 4);
    Image img2 = create_image_from_image2d(input_update, 2);
    Image img3 = create_image_from_image2d(temp_ref, 2);
    __global int* index_ptr = (__global int*)img1.ptr;
    __global vxc_ushort8* update_ptr = (__global vxc_ushort8*)img2.ptr;
    __global vxc_ushort8* output_ptr = (__global vxc_ushort8*)img3.ptr;

    int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx);
    int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW;
    int loc = idx * output_width + gidx;
    output_ptr[loc] = update_ptr[gidy * update_width + gidx];
}

File diff suppressed because it is too large

@@ -29,6 +29,7 @@
#include "VX/vx_ext_program.h"
|
||||
#include "vsi_nn_platform.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_types_prv.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||

@@ -198,10 +199,11 @@ static vsi_status vsi_nn_RegisterVXKernel
    vx_size * program_len = NULL;
    const char **program_src = NULL;
    vx_context ctx = NULL;
    vsi_nn_context_t context = NULL;
    vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index];
    uint8_t i = 0;
    vsi_bool load_from_file = FALSE;
    vsi_nn_runtime_option_t* options;
    options = ((vsi_nn_graph_prv_t*)graph)->options;

#define MAX_BUILDPROGRAM_LEN 128
    char cmd[MAX_BUILDPROGRAM_LEN] = {0};

@@ -210,8 +212,7 @@ static vsi_status vsi_nn_RegisterVXKernel
    memset(cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN);
    status = VSI_FAILURE;
    ctx = vxGetContext( (vx_reference)graph->g );
    context = graph->ctx;
    evis = context->config.evis.ver;
    evis = options->config.evis.ver;

    program_src = (const char**)malloc(kernel_info->resource_num * sizeof(char *));
    CHECK_PTR_FAIL_GOTO( program_src, "Create buffer fail.", final );

@@ -244,12 +245,12 @@ static vsi_status vsi_nn_RegisterVXKernel
    {
        // set default evis version to 2
        snprintf(cmd, MAX_BUILDPROGRAM_LEN,
            "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va);
            "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", options->config.use_40bits_va);
    }
    else
    {
        snprintf(cmd, MAX_BUILDPROGRAM_LEN,
            "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va);
            "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, options->config.use_40bits_va);
    }
    status = vxBuildProgram(program, cmd);

|
|
@ -302,7 +303,7 @@ static vsi_status vsi_nn_RegisterBinKernel
|
|||
vx_size program_len = 0;
|
||||
const uint8_t *program_ptr = NULL;
|
||||
vx_context ctx;
|
||||
vsi_nn_context_t context;
|
||||
vsi_nn_runtime_option_t* options;
|
||||
vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index];
|
||||
|
||||
#define MAX_BUILDPROGRAM_LEN 128
|
||||
|
|
@ -313,8 +314,8 @@ static vsi_status vsi_nn_RegisterBinKernel
|
|||
status = VSI_FAILURE;
|
||||
|
||||
ctx = vxGetContext( (vx_reference)graph->g );
|
||||
context = graph->ctx;
|
||||
evis = context->config.evis.ver;
|
||||
options = ((vsi_nn_graph_prv_t*)graph)->options;
|
||||
evis = options->config.evis.ver;
|
||||
|
||||
program_ptr = vsi_nn_VxBinResourceGetResource(
|
||||
kernel_info->resource_name[kernel_info->resource_num - 1], &program_len);
|
||||
|
|
@ -337,12 +338,12 @@ static vsi_status vsi_nn_RegisterBinKernel
|
|||
{
|
||||
// set default evis version is 2
|
||||
snprintf(cmd, MAX_BUILDPROGRAM_LEN,
|
||||
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va);
|
||||
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", options->config.use_40bits_va);
|
||||
}
|
||||
else
|
||||
{
|
||||
snprintf(cmd, MAX_BUILDPROGRAM_LEN,
|
||||
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va);
|
||||
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, options->config.use_40bits_va);
|
||||
}
|
||||
#else
|
||||
snprintf(cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension");
|
||||
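Both registration paths above now read the EVIS version and the 40-bit-VA flag from the graph's runtime options instead of the context config, then hand one option string to vxBuildProgram. A minimal stand-alone sketch of that assembly, where an evis_ver of 0 is a stand-in for whatever condition selects the "default evis version is 2" branch (the real guard sits outside the hunks shown):

#include <stdio.h>

#define MAX_BUILDPROGRAM_LEN 128

/* Sketch only: compose the "-cl-viv-vx-extension ..." build options the
 * way both hunks above do. */
static void build_cl_options(char cmd[MAX_BUILDPROGRAM_LEN], int evis_ver, int use_40bits_va)
{
    if (evis_ver == 0) /* assumed fallback condition, see lead-in */
    {
        snprintf(cmd, MAX_BUILDPROGRAM_LEN,
            "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", use_40bits_va);
    }
    else
    {
        snprintf(cmd, MAX_BUILDPROGRAM_LEN,
            "-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis_ver, use_40bits_va);
    }
}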
@@ -35,6 +35,8 @@
#include "utils/vsi_nn_constraint_check.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
+#include "vsi_nn_tensor_util_prv.h"
+#include "vsi_nn_error.h"

static vsi_status _try_set_high_presision_tensor
    (

@@ -120,9 +122,22 @@ static vsi_status _static_batchnorm
    vsi_nn_tensor_t ** outputs
    )
{
+#define _TENSOR_LEN 64
    vsi_status status;
    vsi_nn_kernel_param_t * param = NULL;
    vsi_nn_tensor_t* reshape_tensors[6] = { NULL };
+    vsi_size_t shape[VSI_NN_MAX_DIM_NUM];
+    uint32_t new_rank = 4;
+    vsi_nn_tensor_t* input0 = NULL;
+    vsi_nn_tensor_t* output = NULL;
+    char reshape0_tensor_name[_TENSOR_LEN];
+    char reshape1_tensor_name[_TENSOR_LEN];
+    char batch_norm_tensor_name[_TENSOR_LEN];

+    memset(reshape0_tensor_name, 0, sizeof(reshape0_tensor_name));
+    memset(reshape1_tensor_name, 0, sizeof(reshape1_tensor_name));
+    memset(batch_norm_tensor_name, 0, sizeof(batch_norm_tensor_name));

    status = VSI_FAILURE;

    status = _try_set_high_presision_tensor(inputs);

@@ -131,10 +146,43 @@ static vsi_status _static_batchnorm
        VSILOGE("Set tensor attr of high presision fail");
        return status;
    }
-    if(_require_reshape(self, inputs))
+    if (_require_reshape(self, inputs))
    {
-        reshape_tensors[0] = self->nn_param.batch_norm.local->reshaped_input;
-        reshape_tensors[5] = self->nn_param.batch_norm.local->reshaped_output;
+        if (3 == inputs[0]->attr.dim_num)
+        {
+            shape[0] = inputs[0]->attr.size[0];
+            shape[1] = 1;
+            shape[2] = inputs[0]->attr.size[1];
+            shape[3] = inputs[0]->attr.size[2];
+        }
+        else if (5 == inputs[0]->attr.dim_num)
+        {
+            shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
+            shape[1] = inputs[0]->attr.size[2];
+            shape[2] = inputs[0]->attr.size[3];
+            shape[3] = inputs[0]->attr.size[4];
+        }
+
+        input0 = vsi_nn_kernel_insert_reshape_node(self->graph,
+            inputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_BACKWARD);
+        CHECK_PTR_FAIL_GOTO(input0, "Create tensor fail.", final);
+        reshape_tensors[0] = input0;
+        snprintf(reshape0_tensor_name, sizeof(reshape0_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0);
+        if (vxSetReferenceName((vx_reference)reshape_tensors[0]->t, reshape0_tensor_name) == VSI_FAILURE)
+        {
+            VSILOGW("Set uid %u reshape 0 node output name fail", self->uid);
+            goto final;
+        }
+        output = vsi_nn_kernel_insert_reshape_node(self->graph,
+            outputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_FORWARD);
+        CHECK_PTR_FAIL_GOTO(output, "Create tensor fail.", final);
+        reshape_tensors[5] = output;
+        snprintf(reshape1_tensor_name, sizeof(reshape1_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1);
+        if (vxSetReferenceName((vx_reference)outputs[0]->t, reshape1_tensor_name) == VSI_FAILURE)
+        {
+            VSILOGW("Set uid %u reshap 1 node output name fail", self->uid);
+            goto final;
+        }
    }
    else
    {

@@ -155,12 +203,26 @@ static vsi_status _static_batchnorm
        reshape_tensors, 5,
        &reshape_tensors[5], 1, param );

-    if( self->n )
+    if ( self->n )
    {
        status = VSI_SUCCESS;
    }

-    vsi_nn_kernel_param_release( &param );
+    vsi_nn_kernel_param_release(&param);
+
+    if (output)
+    {
+        snprintf(batch_norm_tensor_name, sizeof(batch_norm_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 2);
+        if (vxSetReferenceName((vx_reference)output->t, batch_norm_tensor_name) == VSI_FAILURE)
+        {
+            VSILOGW("Set uid %u instance_norm node output name fail", self->uid);
+            goto final;
+        }
+    }
+
+final:
+    vsi_safe_release_tensor(input0);
+    vsi_safe_release_tensor(output);

    return status;
}
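The reshape branch above folds every supported input rank down to 4-D before the reshape nodes are inserted. Restated as one helper (a sketch: the original only reaches this code for 3-D and 5-D inputs, since _require_reshape gates it; size_t stands in for vsi_size_t):

#include <stddef.h>

/* Fold a 3-D (x, c, n) shape to (x, 1, c, n) and a 5-D shape to
 * (d0*d1, d2, d3, d4); anything else passes through unchanged.
 * Axis 0 is the innermost axis, as in the hunk above. */
static void to_4d_shape(const size_t* in, unsigned rank, size_t out[4])
{
    if (rank == 3)
    {
        out[0] = in[0]; out[1] = 1; out[2] = in[1]; out[3] = in[2];
    }
    else if (rank == 5)
    {
        out[0] = in[0] * in[1]; out[1] = in[2]; out[2] = in[3]; out[3] = in[4];
    }
    else /* assumed already 4-D */
    {
        out[0] = in[0]; out[1] = in[1]; out[2] = in[2]; out[3] = in[3];
    }
}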
@@ -313,68 +375,6 @@ static vsi_status op_compute
    return status;
} /* op_compute() */

-static vsi_status op_optimize
-    (
-    vsi_nn_node_t * self,
-    vsi_nn_tensor_t ** inputs,
-    vsi_nn_tensor_t ** outputs,
-    vsi_nn_opt_direction_e direction
-    )
-{
-    uint32_t dim = 0;
-    vsi_nn_batcnnorm_lcl_data *local = NULL;
-    vsi_size_t shape[VSI_NN_MAX_DIM_NUM];
-    char tensor_name[128];
-
-    dim = inputs[0]->attr.dim_num;
-    if(_require_reshape(self, inputs) == FALSE)
-    {
-        return VSI_SUCCESS;
-    }
-
-    VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid);
-    /*
-        reshape 3d input (xcn) --> 4d input (whcn)
-        reshape 3d output(xcn) --> 4d output(whcn)
-    */
-    dim = 4;
-    if (3 == inputs[0]->attr.dim_num)
-    {
-        shape[0] = inputs[0]->attr.size[0];
-        shape[1] = 1;
-        shape[2] = inputs[0]->attr.size[1];
-        shape[3] = inputs[0]->attr.size[2];
-    }
-    else if (5 == inputs[0]->attr.dim_num)
-    {
-        shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
-        shape[1] = inputs[0]->attr.size[2];
-        shape[2] = inputs[0]->attr.size[3];
-        shape[3] = inputs[0]->attr.size[4];
-    }
-    local = self->nn_param.batch_norm.local;
-    if (VSI_NN_OPTIMIZE_BACKWARD == direction)
-    {
-        local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim);
-    }
-    else
-    {
-        local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim);
-        if(local->reshaped_output && local->reshaped_output->t)
-        {
-            memset(tensor_name, 0, sizeof(tensor_name));
-            snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid);
-            if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE)
-            {
-                VSILOGW("Set uid %u batchnorm reshaped output name fail", self->uid);
-                return VSI_FAILURE;
-            }
-        }
-    }
-
-    return VSI_SUCCESS;
-} /* op_optimize() */
-
static vsi_bool _dynamic_check
    (
    vsi_nn_node_t * self,

@@ -494,58 +494,6 @@ static vsi_bool op_check
    }
} /* op_check() */

-static vsi_bool op_setup
-    (
-    vsi_nn_node_t * self,
-    vsi_nn_tensor_t ** inputs,
-    vsi_nn_tensor_t ** outputs
-    )
-{
-    vsi_nn_batcnnorm_lcl_data *local = NULL;
-    if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
-    {
-        outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
-        memcpy( outputs[0]->attr.size, inputs[0]->attr.size,
-            VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) );
-    }
-
-    if(_require_reshape(self, inputs))
-    {
-        local = (vsi_nn_batcnnorm_lcl_data *)malloc(sizeof(vsi_nn_batcnnorm_lcl_data));
-        if(NULL == local)
-        {
-            return VSI_FAILURE;
-        }
-        memset(local, 0, sizeof(vsi_nn_batcnnorm_lcl_data));
-        self->nn_param.batch_norm.local = local;
-    }
-    return TRUE;
-} /* op_setup() */
-
-static vsi_status op_deinit
-    (
-    vsi_nn_node_t * self
-    )
-{
-    vsi_nn_batch_norm_param *p = &(self->nn_param.batch_norm);
-    if(p->local)
-    {
-        if (p->local->reshaped_input)
-        {
-            vsi_nn_ReleaseTensor(&(p->local->reshaped_input));
-            p->local->reshaped_input = NULL;
-        }
-        if (p->local->reshaped_output)
-        {
-            vsi_nn_ReleaseTensor(&(p->local->reshaped_output));
-            p->local->reshaped_output = NULL;
-        }
-        vsi_nn_safe_free(p->local);
-    }
-    vsi_nn_op_common_deinit(self);
-    return VSI_SUCCESS;
-}
-
#ifdef __cplusplus
extern "C" {
#endif

@@ -555,10 +503,10 @@ DEF_OP_REG
    /* op_name */ BATCH_NORM,
    /* init */ NULL,
    /* compute */ op_compute,
-    /* deinit */ op_deinit,
+    /* deinit */ vsi_nn_op_common_deinit,
    /* check */ op_check,
-    /* setup */ op_setup,
-    /* optimize */ op_optimize,
+    /* setup */ vsi_nn_op_common_setup,
+    /* optimize */ NULL,
    /* input_num */ 5,
    /* output_num */ 1
    );
@@ -118,6 +118,7 @@ static vsi_bool op_setup
    if (outputs[0]->attr.dim_num == 0)
    {
        outputs[0]->attr.size[0] = 1;
        outputs[0]->attr.dim_num = 1;
+        vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
    }
    else
@@ -82,6 +82,7 @@ static vsi_bool op_check
{
    BEGIN_IO_TYPE_DECL(CUMSUM, 1, 1)
        IO_TYPE(D_U32, D_U32)
        IO_TYPE(D_I32, D_I32)
        IO_TYPE(D_F32, D_F32)
        IO_TYPE(D_F16, D_F16)
        IO_TYPE(D_BF16, D_BF16)
@@ -253,6 +253,7 @@ static vsi_bool op_check
        IO_TYPE(D_BOOL8, D_I32)
        IO_TYPE(D_BOOL8, D_U16)
        IO_TYPE(D_BOOL8, D_U32)
+        IO_TYPE(D_BOOL8, D_BF16)
        IO_TYPE(D_U8|Q_ASYM, D_BOOL8)
        IO_TYPE(D_I8|Q_ASYM, D_BOOL8)
        IO_TYPE(D_I8|Q_DFP, D_BOOL8)
@@ -155,10 +155,10 @@ vsi_bool vsi_nn_op_eltwise_setup
    vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
    vsi_bool ret = TRUE;

-    out_rank = inputs[0]->attr.dim_num;
+    out_rank = vsi_nn_get_tensor_dims(inputs[0]);
    for ( i = 1; i < self->input.num; i++)
    {
-        in2_rank = inputs[i]->attr.dim_num;
+        in2_rank = vsi_nn_get_tensor_dims(inputs[i]);
        out_rank = vsi_nn_max( out_rank, in2_rank );
    }

@@ -166,10 +166,10 @@ vsi_bool vsi_nn_op_eltwise_setup
    {
        vsi_size_t sz0, sz1;

-        sz0 = i < inputs[0]->attr.dim_num ? inputs[0]->attr.size[i] : 1;
+        sz0 = i < vsi_nn_get_tensor_dims(inputs[0]) ? inputs[0]->attr.size[i] : 1;
        for ( j = 1; j < self->input.num; j++)
        {
-            sz1 = i < inputs[j]->attr.dim_num ? inputs[j]->attr.size[i] : 1;
+            sz1 = i < vsi_nn_get_tensor_dims(inputs[j]) ? inputs[j]->attr.size[i] : 1;
            sz0 = vsi_nn_max( sz0, sz1 );
            if (sz0 != sz1 && sz0 != 1 && sz1 != 1)
            {

@@ -187,11 +187,12 @@ vsi_bool vsi_nn_op_eltwise_setup
    {
        outputs[0]->attr.dim_num = out_rank;
        memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) );
-        if (out_rank == 1 &&
-            vsi_nn_GetTensorIsScalar(inputs[0]) &&
+        if (vsi_nn_GetTensorIsScalar(inputs[0]) &&
            vsi_nn_GetTensorIsScalar(inputs[1]))
        {
            vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
+            outputs[0]->attr.size[0] = 1;
+            outputs[0]->attr.dim_num = 1;
        }
    }
    else
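The setup code above takes the output rank as the maximum of the input ranks and folds each axis with a broadcast rule; switching from raw attr.dim_num to vsi_nn_get_tensor_dims presumably lets scalar-flagged tensors report an effective rank so their placeholder dimension no longer wins the maximum. The per-axis rule, restated in isolation (size_t standing in for vsi_size_t):

#include <stdbool.h>
#include <stddef.h>

/* One axis of the broadcast rule: extents must match unless one side is 1;
 * the result is the larger extent. The caller treats an axis beyond a
 * tensor's rank as extent 1, exactly as the hunk above does. */
static bool broadcast_dim(size_t a, size_t b, size_t* out)
{
    if (a != b && a != 1 && b != 1)
    {
        return false; /* incompatible shapes; setup reports an error */
    }
    *out = (a > b) ? a : b;
    return true;
}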
@@ -199,6 +199,7 @@ static vsi_bool op_setup
    if (o_rank == 0)
    {
        outputs[0]->attr.size[0] = 1;
        outputs[0]->attr.dim_num = 1;
+        vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
    }
    else
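This is now the third hunk in the set that applies the same zero-rank convention: a scalar result is stored as a legal 1-element 1-D tensor, and an is-scalar flag records that the dimension is only a placeholder. The idiom in one place (a sketch against ovxlib's public API, not new library code):

/* Mirror of the recurring pattern above: normalize a rank-0 output. */
static void mark_scalar_output(vsi_nn_tensor_t* out)
{
    if (out->attr.dim_num == 0)
    {
        out->attr.size[0] = 1;               /* placeholder extent */
        out->attr.dim_num = 1;               /* keep the tensor valid */
        vsi_nn_SetTensorIsScalar(out, TRUE); /* remember true rank 0 */
    }
}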
@@ -306,6 +306,8 @@ static vsi_bool _op_check
        IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP)
        IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM)
        IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM)
+        IO_TYPE(D_U16|Q_ASYM, D_F32, D_F32, D_U16|Q_ASYM)
+        IO_TYPE(D_U16|Q_SYM, D_F32, D_F32, D_U16|Q_SYM)
    END_IO_TYPE_DECL(GROUP_NORM)
    if (!VALIDATE_OP_IO_TYPES(GROUP_NORM, self, inputs, self->input.num, outputs, self->output.num))
    {
@@ -25,6 +25,7 @@
#include <stdlib.h>

#include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_log.h"
#include "vsi_nn_graph.h"

@@ -197,6 +198,7 @@ static vsi_bool op_setup_default
    vsi_nn_internal_tensor_t * hstate_fc_outputs[GRUCELL_GATE_CNT] = { NULL };
    vsi_nn_internal_tensor_t * h_times_r = NULL;
    vsi_nn_tensor_attr_t attr;
+    vsi_nn_activation_e recurrent_activation = p->recurrent_activation;

    vsi_nn_internal_init_node_wksp( self );

@@ -230,7 +232,8 @@ static vsi_bool op_setup_default
    memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
    attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
    if (inputs[GRUCELL_IN_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ||
-        self->graph->ctx->config.support_stream_processor)
+        (((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor &&
+        recurrent_activation == VSI_NN_ACT_SIGMOID))
    {
        attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
    }
@@ -93,37 +93,15 @@ static vsi_bool op_check
{
    BEGIN_IO_TYPE_DECL(L1_LAYER_NORM, 4, 1)
        IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32)
        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_F16)
        IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F16)
        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_U8|Q_ASYM)
        IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_DFP)
        IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_DFP)
        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_ASYM)
        IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_ASYM)
        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_SYM)
        IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_SYM)
        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_DFP)
        IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_DFP)
        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_ASYM)
        IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_ASYM)
        IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_SYM)
        IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_SYM)
        IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_BF16)
        IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
        IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_U8|Q_ASYM)
        IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_I16|Q_DFP)
        IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_I16|Q_ASYM)
        IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_I16|Q_SYM)
        IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_F16)
        IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
        IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_F16)
        IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_I8|Q_DFP)
        IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_I8|Q_ASYM)
        IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_I8|Q_SYM)
        IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_F16)
        IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
        IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_F16)
        IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
        IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F16)
        IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_I16|Q_DFP)
@@ -25,6 +25,7 @@
#include <stdlib.h>

#include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"

@@ -351,7 +352,7 @@ static vsi_bool op_setup
    }
    else if ( ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 &&
        outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) ||
-        self->graph->ctx->config.support_stream_processor )
+        ((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor )
    {
        vsi_nn_internal_tensor_t* output_tensor = NULL;
        vsi_nn_internal_tensor_t* reshape_tensor = NULL;
@@ -106,7 +106,7 @@ static vsi_bool op_setup

    vsi_nn_internal_init_node_wksp( self );

-    if ( axis != 0 && !self->graph->ctx->config.support_stream_processor)
+    if ( axis != 0 && !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor)
    {
        vsi_nn_internal_tensor_t* mean_tensor = NULL;
        vsi_nn_internal_tensor_t* vari_tensor = NULL;
@@ -25,6 +25,7 @@
#include <stdlib.h>

#include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_log.h"
#include "vsi_nn_graph.h"

@@ -139,7 +140,7 @@ static vsi_bool op_setup

    p->is_cifg = inputs[LSTMUNIT_ACT_INPUT_FC_I] == NULL;
    p->is_projection = outputs[LSTMUNIT_ACT_HSTATE_OUT] == NULL;
-    if (self->graph->ctx->config.support_stream_processor)
+    if (((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor)
    {
        p->is_layer_norm = inputs[LSTMUNIT_ACT_HSTATE_FC_F] == NULL;
    }
@@ -100,6 +100,7 @@ static vsi_bool op_check
        IO_TYPE(D_I32, D_I16|Q_ASYM)
        IO_TYPE(D_I32, D_I16|Q_SYM)
        IO_TYPE(D_I32, D_I32)
+        IO_TYPE(D_I32, D_BF16)
        IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM)
        IO_TYPE(D_U8|Q_ASYM, D_I16|Q_SYM)
        IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP)

@@ -111,8 +112,10 @@ static vsi_bool op_check
        IO_TYPE(D_U8|Q_ASYM, D_BF16)
        IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
        IO_TYPE(D_I8|Q_ASYM, D_F16)
+        IO_TYPE(D_I8|Q_ASYM, D_BF16)
        IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
        IO_TYPE(D_I8|Q_DFP, D_F16)
+        IO_TYPE(D_I8|Q_DFP, D_BF16)
        IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP)
        IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM)
        IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)

@@ -124,11 +127,14 @@ static vsi_bool op_check
        IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM)
        IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM)
        IO_TYPE(D_I16|Q_ASYM, D_F16)
+        IO_TYPE(D_I16|Q_ASYM, D_BF16)
        IO_TYPE(D_I16|Q_ASYM, D_F32)
        IO_TYPE(D_I16|Q_SYM, D_U8|Q_ASYM)
        IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM)
+        IO_TYPE(D_I16|Q_SYM, D_BF16)
        IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM)
        IO_TYPE(D_I8|Q_SYM, D_F16)
+        IO_TYPE(D_I8|Q_SYM, D_BF16)
        IO_TYPE(D_BF16, D_BF16)
    END_IO_TYPE_DECL(ONE_HOT)
    if (!VALIDATE_OP_IO_TYPES(ONE_HOT, self, inputs, self->input.num, outputs, self->output.num))
@@ -36,6 +36,7 @@
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_error.h"

#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)

@@ -50,33 +51,52 @@ static vsi_status op_compute
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_param_t * param = NULL;
    vsi_nn_kernel_node_t n = NULL;
-    param =vsi_nn_kernel_param_create();
+    vsi_nn_tensor_t* reshape_tensor = NULL;
+    vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+    vsi_nn_pre_process_rgb_param* p = NULL;

-    vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_rgb.local.scale_x );
-    vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_rgb.local.scale_y );
-    vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_rgb.rect.left );
-    vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_rgb.rect.top );
-    vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_rgb.r_mean );
-    vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_rgb.g_mean );
-    vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb.b_mean );
-    vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_rgb.r_scale );
-    vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_rgb.g_scale );
-    vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_rgb.b_scale );
-    vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_rgb.reverse_channel );
-    vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_rgb.local.enable_perm );
-    vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb.local.enable_copy );
-    n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", inputs, 1, outputs, 1, param );
-    if( n != NULL )
+    memcpy(shape, inputs[0]->attr.size, inputs[0]->attr.dim_num * sizeof(vsi_size_t));
+
+    shape[0] = shape[1] * shape[0];
+    shape[1] = shape[2];
+    shape[2] = 1;
+
+    reshape_tensor = vsi_nn_reshape_tensor(self->graph,
+        inputs[0], shape, inputs[0]->attr.dim_num);
+    CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor failed", final);
+
+    p = (vsi_nn_pre_process_rgb_param*)&(self->nn_param.pre_process_rgb);
+
+    param = vsi_nn_kernel_param_create();
+
+    vsi_nn_kernel_param_add_int32( param, "scale_x", p->local->scale_x );
+    vsi_nn_kernel_param_add_int32( param, "scale_y", p->local->scale_y );
+    vsi_nn_kernel_param_add_int32( param, "left", p->rect.left );
+    vsi_nn_kernel_param_add_int32( param, "top", p->rect.top );
+    vsi_nn_kernel_param_add_float32( param, "r_mean", p->r_mean );
+    vsi_nn_kernel_param_add_float32( param, "g_mean", p->g_mean );
+    vsi_nn_kernel_param_add_float32( param, "b_mean", p->b_mean );
+    vsi_nn_kernel_param_add_float32( param, "r_scale", p->r_scale );
+    vsi_nn_kernel_param_add_float32( param, "g_scale", p->g_scale );
+    vsi_nn_kernel_param_add_float32( param, "b_scale", p->b_scale );
+    vsi_nn_kernel_param_add_int32( param, "reverse", p->reverse_channel );
+    vsi_nn_kernel_param_add_int32( param, "enable_perm", p->local->enable_perm );
+    vsi_nn_kernel_param_add_int32( param, "enable_copy", p->local->enable_copy );
+    n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", &reshape_tensor, 1, outputs, 1, param );
+    if ( n != NULL )
    {
        self->n = (vx_node)n;
        status = VSI_SUCCESS;
    }

-    if(param != NULL)
+    if (param != NULL)
    {
        vsi_nn_kernel_param_release( &param );
    }

+final:
+    vsi_safe_release_tensor(reshape_tensor);
+
    return status;
} /* op_compute() */

@@ -166,35 +186,57 @@ static vsi_bool op_setup
    }

-    self->nn_param.pre_process_rgb.local.enable_perm = FALSE;
+    p->local->enable_perm = FALSE;

-    if (self->nn_param.pre_process_rgb.local.enable_perm == FALSE)
+    if (p->local->enable_perm == FALSE)
    {
-        p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]);
-        p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
+        p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]);
+        p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
    }
    else
    {
-        p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
-        p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]);
+        p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
+        p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]);
    }

-    p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15)));
+    p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15)));

    return TRUE;
} /* op_setup() */

+static vsi_status op_init
+    (
+    vsi_nn_node_t* self
+    )
+{
+    vsi_status status = VSI_SUCCESS;
+
+    self->nn_param.pre_process_rgb.local =
+        (vsi_nn_pre_process_rgb_lcl_data*)malloc(sizeof(vsi_nn_pre_process_rgb_lcl_data));
+
+    if (NULL == self->nn_param.pre_process_rgb.local)
+    {
+        return VX_ERROR_NO_MEMORY;
+    }
+
+    memset(self->nn_param.pre_process_rgb.local, 0, sizeof(vsi_nn_pre_process_rgb_lcl_data));
+
+    return status;
+} /* op_init() */
+
static vsi_status op_deinit
    (
    vsi_nn_node_t * self
    )
{
-    if (self->nn_param.pre_process_rgb.local.local_tensor != NULL)
+    if (self->nn_param.pre_process_rgb.local->local_tensor != NULL)
    {
-        vxReleaseTensor(&self->nn_param.pre_process_rgb.local.local_tensor);
-        self->nn_param.pre_process_rgb.local.local_tensor = NULL;
+        vxReleaseTensor(&self->nn_param.pre_process_rgb.local->local_tensor);
+        self->nn_param.pre_process_rgb.local->local_tensor = NULL;
    }

+    vsi_nn_safe_free(self->nn_param.pre_process_rgb.local);
+
    vsi_nn_op_common_deinit(self);

    return VSI_SUCCESS;

@@ -208,7 +250,7 @@ extern "C" {
DEF_OP_REG
    (
    /* op_name */ PRE_PROCESS_RGB,
-    /* init */ NULL,
+    /* init */ op_init,
    /* compute */ op_compute,
    /* deinit */ op_deinit,
    /* check */ op_check,
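The rewritten op_compute flattens the RGB input before the kernel selector runs. A worked example of the three shape assignments, assuming the input is stored as size[] = {width, height, channels} = {224, 224, 3}: shape becomes {224*224, 3, 1} = {50176, 3, 1}, i.e. one entry per pixel by one per channel. As a helper (size_t standing in for vsi_size_t):

#include <stddef.h>

/* Same arithmetic as the hunk above: (W, H, C, ...) -> (W*H, C, 1). */
static void flatten_rgb_shape(size_t shape[3])
{
    shape[0] = shape[1] * shape[0]; /* W*H: all pixels on one axis */
    shape[1] = shape[2];            /* channel count moves up */
    shape[2] = 1;                   /* trailing axis collapsed */
}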
@@ -79,7 +79,10 @@ static vsi_status _prelu_op_compute
    vsi_status status = VSI_FAILURE;
    vsi_nn_prelu_param *prelu = &self->nn_param.prelu;
    vsi_ssize_t shapes[VSI_NN_MAX_DIM_NUM] = { 1 };
-    vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
+    vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
+    vsi_nn_tensor_t* input0 = NULL;
+    vsi_nn_tensor_t* input1 = NULL;
+    vsi_nn_tensor_t* output = NULL;
    vsi_bool one_rank = FALSE;
    vsi_bool is_per_channel_alpha = 0;
    vsi_size_t alpha_shape = 1;

@@ -88,6 +91,7 @@ static vsi_status _prelu_op_compute
    uint32_t dims = outputs[0]->attr.dim_num;

    reshape_tensors[0] = inputs[0];
+    reshape_tensors[2] = outputs[0];
    one_rank = _is_one_rank_tensor(inputs[1], &alpha_shape);

    for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)

@@ -114,18 +118,23 @@ static vsi_status _prelu_op_compute
            dims = inputs[1]->attr.dim_num;
        }

-        reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
+        input1 = vsi_nn_reshape_tensor( self->graph,
            inputs[1], (vsi_size_t*)shapes, dims );
+        CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
+        reshape_tensors[1] = input1;
    }
    else
    {
        memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t));
-        reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
+        input1 = vsi_nn_reshape_tensor( self->graph,
            inputs[1], (vsi_size_t*)shapes, inputs[1]->attr.dim_num );
+        CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
+        reshape_tensors[1] = input1;
    }
}
else
{
+    uint32_t rank = inputs[0]->attr.dim_num;
    dims = inputs[1]->attr.dim_num;

    memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t));

@@ -141,9 +150,32 @@ static vsi_status _prelu_op_compute
            shapes[1] = 1;
            dims = 2;
        }
+        else if (one_rank && inputs[1]->attr.is_const == TRUE &&
+            alpha_shape == inputs[0]->attr.size[0] &&
+            alpha_shape == inputs[1]->attr.size[0] &&
+            rank < 3)
+        {
+            is_per_channel_alpha = TRUE;
+            shapes[0] = 1;
+            shapes[1] = 1;
+            shapes[2] = alpha_shape;
+            shapes[3] = rank > 1 ? inputs[0]->attr.size[1] : 1;
+            dims = 4;
+            input0 = vsi_nn_reshape_tensor(self->graph, inputs[0], (vsi_size_t*)shapes, dims);
+            CHECK_PTR_FAIL_GOTO(input0, "Create tensor fail.", final);
+            reshape_tensors[0] = input0;
+            output = vsi_nn_reshape_tensor(self->graph, outputs[0], (vsi_size_t*)shapes, dims);
+            CHECK_PTR_FAIL_GOTO(output, "Create tensor fail.", final);
+            reshape_tensors[2] = output;
+            shapes[0] = alpha_shape;
+            shapes[1] = 1;
+            dims = 2;
+        }

-        reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
+        input1 = vsi_nn_reshape_tensor( self->graph,
            inputs[1], (vsi_size_t*)shapes, dims );
+        CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
+        reshape_tensors[1] = input1;
    }

    // Add params

@@ -153,15 +185,19 @@ static vsi_status _prelu_op_compute
    self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
        kernel_name,
        &reshape_tensors[0], 2,
-        outputs, 1, param );
+        &reshape_tensors[2], 1, param );

    vsi_nn_kernel_param_release( &param );
-    vsi_nn_ReleaseTensor( &reshape_tensors[1] );
-    if( self->n )
+    if ( self->n )
    {
        status = VSI_SUCCESS;
    }

+final:
+    vsi_safe_release_tensor(input0);
+    vsi_safe_release_tensor(input1);
+    vsi_safe_release_tensor(output);
+
    return status;
} /* _prelu_op_compute() */

@@ -211,28 +247,36 @@ static vsi_bool op_check
    BEGIN_IO_TYPE_DECL(PRELU, 2, 1)
-        IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM)
-        IO_TYPE(D_F16, D_F16, D_I16|Q_DFP)
-        IO_TYPE(D_F16, D_F16, D_I8|Q_DFP)
-        IO_TYPE(D_F16, D_F16, D_F16)
-        IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM)
-        IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16)
-        IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP)
-        IO_TYPE(D_I8|Q_DFP, D_F16, D_F16)
-        IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP)
-        IO_TYPE(D_I16|Q_DFP, D_F16, D_F16)
-        IO_TYPE(D_BF16, D_F16, D_BF16)
-        IO_TYPE(D_BF16, D_BF16, D_BF16)
-        IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
-        IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16)
-        IO_TYPE(D_F32, D_F32, D_F32)
-        IO_TYPE(D_I32, D_I32, D_I32)
+        IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM)
+        IO_TYPE(D_F16, D_F16, D_I16|Q_DFP)
+        IO_TYPE(D_F16, D_F16, D_I8|Q_DFP)
+        IO_TYPE(D_F16, D_F16, D_F16)
+        IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM)
+        IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16)
+        IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP)
+        IO_TYPE(D_I8|Q_DFP, D_F16, D_F16)
+        IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM)
+        IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM)
+        IO_TYPE(D_I8|Q_SYM, D_F16, D_F16)
+        IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16)
+        IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP)
+        IO_TYPE(D_I16|Q_DFP, D_F16, D_F16)
+        IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM)
+        IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM)
+        IO_TYPE(D_I16|Q_SYM, D_F16, D_F16)
+        IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16)
+        IO_TYPE(D_BF16, D_F16, D_BF16)
+        IO_TYPE(D_BF16, D_BF16, D_BF16)
+        IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
+        IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16)
+        IO_TYPE(D_F32, D_F32, D_F32)
+        IO_TYPE(D_I32, D_I32, D_I32)

        /* HW 9.0 */
-        IO_TYPE(D_F32, D_BF16, D_BF16)
-        IO_TYPE(D_BF16, D_BF16, D_F32)
+        IO_TYPE(D_F32, D_BF16, D_BF16)
+        IO_TYPE(D_BF16, D_BF16, D_F32)
    END_IO_TYPE_DECL(PRELU)
-    if(!VALIDATE_OP_IO_TYPES(PRELU, self, inputs, self->input.num, outputs, self->output.num)) {
+    if (!VALIDATE_OP_IO_TYPES(PRELU, self, inputs, self->input.num, outputs, self->output.num)) {
        char* desc = generate_op_io_types_desc(inputs,
            self->input.num, outputs, self->output.num);
        VSILOGE("Inputs/Outputs data type not support: %s", desc);
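The new per-channel branch in _prelu_op_compute views a rank-1 or rank-2 input of C channels as a 4-D tensor with the channel extent on axis 2, and the constant alpha as a (C, 1) matrix. The shape bookkeeping under those assumptions (size_t for vsi_size_t; n is 1 for a rank-1 input):

#include <stddef.h>

/* Shapes produced by the per-channel-alpha branch above. */
static void per_channel_prelu_shapes(size_t c, size_t n,
    size_t io_shape[4], size_t alpha_shape[2])
{
    io_shape[0] = 1;  io_shape[1] = 1;  /* two singleton inner axes */
    io_shape[2] = c;  io_shape[3] = n;  /* channels on axis 2 */
    alpha_shape[0] = c;                 /* one slope per channel */
    alpha_shape[1] = 1;
}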
@@ -162,7 +162,7 @@ static vsi_bool _check_is_sp_supported_type
    int32_t * axes = self->nn_param.reduce.local2->axes;
    int32_t axes_num = self->nn_param.reduce.local2->axes_num;

-    if ( !self->graph->ctx->config.support_stream_processor ||
+    if ( !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor ||
        (type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN && type != VSI_NN_REDUCE_MAX) )
    {
        return FALSE;

@@ -788,7 +788,7 @@ static vsi_bool op_set_reduce_axis(
    }
    *out_rank_x = inputs[0]->attr.dim_num;
    }
-    else if (!self->graph->ctx->config.support_stream_processor ||
+    else if (!((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor ||
        resolved_dim_count > 2)
    {
        optimzation_input_size(
@@ -61,7 +61,7 @@ static vsi_status op_compute
    vx_nn_reshape_params_t reshape_param;

    memset(&attr, 0, sizeof(attr));
-    attr.size[0] = self->nn_param.reshape.dim_num;
+    attr.size[0] = vsi_nn_max(self->nn_param.reshape.dim_num, 1);
    attr.dim_num = 1;
    attr.is_const = TRUE;
    attr.dtype.vx_type = VSI_NN_TYPE_INT32;

@@ -124,17 +124,28 @@ static vsi_bool op_setup
    vsi_bool ret = TRUE;
    if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
    {
-        vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
-        uint32_t i = 0;
-        for (i = 0; i < self->nn_param.reshape.dim_num; i++)
+        if (self->nn_param.reshape.dim_num == 0 ||
+            self->nn_param.reshape.size == NULL
+            )
        {
-            shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? \
-                (vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i];
+            outputs[0]->attr.size[0] = 1;
+            outputs[0]->attr.dim_num = 1;
+            vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
        }
+        else
+        {
+            vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+            uint32_t i = 0;
+            for (i = 0; i < self->nn_param.reshape.dim_num; i++)
+            {
+                shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? \
+                    (vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i];
+            }
+            ret = vsi_nn_CalcReshapeTensor(inputs[0],
+                outputs[0],
+                shape,
+                self->nn_param.reshape.dim_num);
+        }
-        ret = vsi_nn_CalcReshapeTensor(inputs[0],
-            outputs[0],
-            shape,
-            self->nn_param.reshape.dim_num);
    }

    return ret;
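The (uint32_t)-1 sentinel kept in the loop above is the usual "infer this axis" wildcard for reshape. One way such a shape can be resolved, assuming at most one wildcard and a total element count that divides evenly (a sketch; vsi_nn_CalcReshapeTensor is what actually performs this in the library):

#include <stddef.h>

/* Resolve one -1 wildcard: divide the total element count by the product
 * of the known extents. Assumes exactly one axis is (size_t)-1. */
static size_t resolve_wildcard(size_t total_elements, const size_t* shape, unsigned rank)
{
    size_t known = 1;
    unsigned i;
    for (i = 0; i < rank; i++)
    {
        if (shape[i] != (size_t)-1)
        {
            known *= shape[i];
        }
    }
    return total_elements / known; /* extent of the wildcard axis */
}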
@@ -66,7 +66,7 @@ static vsi_status op_compute
    }

    memset(&attr, 0, sizeof(attr));
-    attr.size[0] = self->nn_param.reshape2.dim_num;
+    attr.size[0] = vsi_nn_max(self->nn_param.reshape2.dim_num, 1);
    attr.dim_num = 1;
    attr.is_const = TRUE;
    attr.dtype.vx_type = VSI_NN_TYPE_INT32;

@@ -161,13 +161,24 @@ static vsi_bool op_setup
    vsi_bool ret = TRUE;
    if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
    {
-        vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
-        memcpy(shape, self->nn_param.reshape2.size,
-            sizeof(vsi_size_t) * self->nn_param.reshape2.dim_num);
-        ret = vsi_nn_CalcReshapeTensor(inputs[0],
-            outputs[0],
-            shape,
-            self->nn_param.reshape2.dim_num);
+        if (self->nn_param.reshape2.dim_num == 0 ||
+            self->nn_param.reshape2.size == NULL
+            )
+        {
+            outputs[0]->attr.size[0] = 1;
+            outputs[0]->attr.dim_num = 1;
+            vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
+        }
+        else
+        {
+            vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+            memcpy(shape, self->nn_param.reshape2.size,
+                sizeof(vsi_size_t) * self->nn_param.reshape2.dim_num);
+            ret = vsi_nn_CalcReshapeTensor(inputs[0],
+                outputs[0],
+                shape,
+                self->nn_param.reshape2.dim_num);
+        }
    }

    return ret;
Some files were not shown because too many files have changed in this diff